プログラム

import os
import sys
import tempfile
from Bio import PDB, SeqIO, Align
import numpy as np
from math import sqrt
from simtk import openmm, unit
from simtk.openmm import app

############################################################
# Helper functions
############################################################

def read_fasta_sequences(fasta_file):
    sequences = {}
    for record in SeqIO.parse(fasta_file, "fasta"):
        sequences[record.id] = str(record.seq)
    return sequences

def get_pdb_structure(pdb_file):
    parser = PDB.PDBParser(QUIET=True)
    structure = parser.get_structure("structure", pdb_file)
    return structure

def write_pdb_structure(structure, out_file):
    io = PDB.PDBIO()
    io.set_structure(structure)
    io.save(out_file)

def get_chain_sequence(structure, chain_id):
    res_list = [res for res in structure[0][chain_id] if PDB.is_aa(res, standard=True)]
    seq = ""
    resid_list = []
    for res in res_list:
        if res.has_id("CA"):
            three_letter = res.resname
            one_letter = PDB.Polypeptide.three_to_one(three_letter)
            seq += one_letter
            resid_list.append(res.get_id()[1])
    return seq, resid_list

def find_gaps(sequence, resid_list):
    # Gaps: identify discontinuities in the residue numbering
    gaps = []
    prev = None
    for i, r in enumerate(resid_list):
        if prev is not None and (r - prev) > 1:
            # gap found
            start_gap = prev + 1
            end_gap = r - 1
            # these are indices in terms of residue numbering
            gaps.append((start_gap, end_gap))
        prev = r
    return gaps

def extract_substructure(structure, chain_id, start_res, end_res):
    # Extract a substructure from start_res to end_res inclusive
    # Return None if not possible
    residues_to_extract = []
    for res in structure[0][chain_id]:
        if res.get_id()[1] >= start_res and res.get_id()[1] <= end_res and PDB.is_aa(res):
            residues_to_extract.append(res)
    if len(residues_to_extract) == 0:
        return None
    # Build a new structure
    temp_model = PDB.Structure.Structure("temp")
    temp_model.add(PDB.Model.Model(0))
    temp_chain = PDB.Chain.Chain(chain_id)
    temp_model[0].add(temp_chain)
    for res in residues_to_extract:
        temp_chain.add(res.copy())
    return temp_model

def superimpose_models(fixed_model, moving_model, fixed_chain_id, moving_chain_id, fixed_selection_residues, moving_selection_residues):
    # Superimpose moving_model onto fixed_model based on selected residues (lists of residue numbers)
    fixed_atoms = []
    moving_atoms = []
    fixed_chain = fixed_model[0][fixed_chain_id]
    moving_chain = moving_model[0][moving_chain_id]

    # Build atom lists
    for (f_res, m_res) in zip(fixed_selection_residues, moving_selection_residues):
        if f_res in [r.get_id()[1] for r in fixed_chain] and m_res in [r.get_id()[1] for r in moving_chain]:
            f_atom = fixed_chain[f_res]['CA'] if 'CA' in fixed_chain[f_res] else None
            m_atom = moving_chain[m_res]['CA'] if 'CA' in moving_chain[m_res] else None
            if f_atom is not None and m_atom is not None:
                fixed_atoms.append(f_atom)
                moving_atoms.append(m_atom)

    if len(fixed_atoms) < 3 or len(moving_atoms) < 3:
        return False

    super_imposer = PDB.Superimposer()
    super_imposer.set_atoms(fixed_atoms, moving_atoms)
    super_imposer.apply(moving_model.get_atoms())
    return True

def sequence_align(seq1, seq2):
    # Use Pairwise alignment to align sequences and return best alignment
    aligner = Align.PairwiseAligner()
    aligner.mode = 'global'
    alignment = aligner.align(seq1, seq2)
    best = alignment[0]
    # Return alignment strings and start/end
    return best

def extract_colabfold_fragment(colabfold_structure, chain_id, start_seq_idx, end_seq_idx, full_sequence):
    # Extract the fragment from the colabfold structure matching full_sequence indexes
    # We'll map sequence index to residue numbering in colabfold structure
    colab_seq, colab_resid_list = get_chain_sequence(colabfold_structure, chain_id)
    # We assume start_seq_idx and end_seq_idx are 0-based indices in the full_sequence
    # We need to align colab_seq to full_sequence to find corresponding residues

    alignment = sequence_align(full_sequence, colab_seq)
    # alignment sequences can be read from alignment.aligned
    # alignment.aligned is a list of tuples: (seq1_start, seq1_end), (seq2_start, seq2_end)
    # We want the residues in colab_seq that correspond to [start_seq_idx:end_seq_idx+1] in full_sequence

    # Convert alignment to mapping
    # We'll step through aligned sequences to get mapping of full_seq positions to colab_seq positions
    seq1_aligned = alignment.seqA
    seq2_aligned = alignment.seqB
    full_to_colab_map = {}
    fpos = -1
    cpos = -1
    for a1, a2 in zip(seq1_aligned, seq2_aligned):
        if a1 != '-':
            fpos += 1
        if a2 != '-':
            cpos += 1
        if a1 != '-' and a2 != '-':
            full_to_colab_map[fpos] = cpos

    # Check if all needed residues are in the map
    needed = range(start_seq_idx, end_seq_idx+1)
    if not all(n in full_to_colab_map for n in needed):
        return None

    # Now get the residue numbers in colab structure
    needed_colab_positions = [full_to_colab_map[n] for n in needed]
    colab_extract_resids = [colab_resid_list[i] for i in needed_colab_positions]

    # Extract substructure
    min_res = min(colab_extract_resids)
    max_res = max(colab_extract_resids)
    fragment = extract_substructure(colabfold_structure, chain_id, min_res, max_res)
    return fragment, colab_extract_resids, (min_res, max_res)

def rmsd(a, b):
    diff = a - b
    return sqrt((diff*diff).sum())

def local_minimization(pdb_file, out_file, selection_mask=None):
    # Minimization with OpenMM
    # If selection_mask is provided, it should be a set of atom indices to be flexible, others restrained.
    pdb = app.PDBFile(pdb_file)
    forcefield = app.ForceField('amber14-all.xml', 'amber14-protein.ff14SB.xml')
    system = forcefield.createSystem(pdb.topology, constraints=app.HBonds)
    integrator = openmm.LangevinIntegrator(300*unit.kelvin, 1.0/unit.picosecond, 0.002*unit.picoseconds)

    # If selection_mask is given, apply positional restraints to others
    if selection_mask is not None:
        force = openmm.CustomExternalForce('0.5*k*(x - x0)^2 + 0.5*k*(y - y0)^2 + 0.5*k*(z - z0)^2')
        force.addPerParticleParameter('k')
        force.addPerParticleParameter('x0')
        force.addPerParticleParameter('y0')
        force.addPerParticleParameter('z0')
        for i, atom_crd in enumerate(pdb.positions):
            if i not in selection_mask:
                force.addParticle(i, [5.0*unit.kilocalories_per_mole/unit.angstrom**2,
                                       atom_crd.x, atom_crd.y, atom_crd.z])
        system.addForce(force)

    simulation = app.Simulation(pdb.topology, system, integrator)
    simulation.context.setPositions(pdb.positions)
    simulation.minimizeEnergy(maxIterations=500)
    positions = simulation.context.getState(getPositions=True).getPositions()
    with open(out_file, 'w') as f:
        app.PDBFile.writeFile(simulation.topology, positions, f)

def integrate_fragment(main_structure, chain_id, gap, fragment, fragment_resids):
    # gap is (start_missing, end_missing)
    # We want to integrate fragment into main_structure
    # Steps:
    # 1. identify flanking residues in main_structure
    # 2. superimpose fragment onto main_structure based on flanking residues if possible
    # If no flanks, skip
    start_missing, end_missing = gap
    chain = main_structure[0][chain_id]
    # find flanks: before gap and after gap
    before_res = start_missing - 1
    after_res = end_missing + 1
    before_exist = before_res in [r.get_id()[1] for r in chain]
    after_exist = after_res in [r.get_id()[1] for r in chain]

    # If no flanks exist, we may have only one side to align or none.
    # We'll attempt to align using what is available.
    fixed_selection = []
    moving_selection = []

    if before_exist:
        fixed_selection.append(before_res)
    if after_exist:
        fixed_selection.append(after_res)

    # We attempt to align to corresponding residues in fragment
    # The fragment should represent the missing region. We'll try first residue for start and last for end
    # The fragment_resids map to the inserted region in fragment.
    # If we have both flanks, align N-terminal flank of main_structure to fragment's N-terminal residue
    # and C-terminal flank of main_structure to fragment's C-terminal residue
    # If only one flank, we rely on a single point superposition (not recommended, but we try)
    if len(fragment_resids) == 0:
        return None, False

    # Attempt to correlate flanks to fragment ends
    move_selection = []
    if before_exist:
        move_selection.append(fragment_resids[0])
    if after_exist:
        move_selection.append(fragment_resids[-1])

    if len(move_selection) == 0:
        # No superimpose possible
        return None, False

    # Superimpose fragment onto main_structure
    success = superimpose_models(main_structure, fragment, chain_id, chain_id, fixed_selection, move_selection)
    if not success:
        return None, False

    # If superimpose successful, integrate the fragment residues into main_structure
    # Insert residues into main_structure chain at correct positions
    # We'll rebuild chain. This is tricky but we try.

    original_chain = chain
    # Extract residues before gap, gap, after gap
    new_chain = PDB.Chain.Chain(chain_id)
    for res in original_chain:
        res_id = res.get_id()[1]
        if res_id < start_missing: new_chain.add(res.copy()) elif res_id > end_missing:
            # We'll insert fragment first before these
            pass

    # Add fragment residues
    frag_chain = fragment[0][chain_id]
    for res in frag_chain:
        # Adjust residue id if necessary
        # The gap region should correspond to start_missing...end_missing
        # We'll assign these residue IDs to the fragment residues
        # The number of residues in fragment should match the missing gap length
        # or at least represent that segment
        # We'll just map sequentially
        # Count how many residues in fragment
        frag_len = len([r for r in frag_chain])
        needed_len = end_missing - start_missing + 1
        # If they differ, we just map linearly
        # We'll do a simple mapping: fragment residues index -> start_missing + i
        # This assumes fragment residues are continuous
        frag_resids_sorted = sorted(fragment_resids)
        # build a map:
        frag_map = {}
        for i, fr in enumerate(frag_resids_sorted):
            frag_map[fr] = start_missing + i
        if res.get_id()[1] in frag_map:
            new_res = res.copy()
            # change residue id
            new_res.id = (' ', frag_map[res.get_id()[1]], ' ')
            new_chain.add(new_res)

    # Add the after gap residues
    for res in original_chain:
        res_id = res.get_id()[1]
        if res_id > end_missing:
            new_chain.add(res.copy())

    # Rebuild the structure
    new_structure = PDB.Structure.Structure("new")
    new_structure.add(PDB.Model.Model(0))
    for ch in main_structure[0]:
        if ch.id == chain_id:
            new_structure[0].add(new_chain)
        else:
            new_structure[0].add(ch.copy())

    return new_structure, True

def process_complex(main_pdb, colabfold_dir, fasta_file, output_pdb):
    # Read main structure
    main_structure = get_pdb_structure(main_pdb)
    # Assume multiple chains
    sequences = read_fasta_sequences(fasta_file)

    # For each chain in main_structure, identify gaps and attempt to fill
    for chain in main_structure[0]:
        chain_id = chain.id
        if chain_id not in sequences:
            # Skip if no sequence provided
            continue
        full_sequence = sequences[chain_id]
        seq, resid_list = get_chain_sequence(main_structure, chain_id)
        gaps = find_gaps(seq, resid_list)

        # For each gap, try to fill
        # We'll need corresponding colabfold model for that chain (assume .pdb in colabfold_dir)
        colabfold_pdb = os.path.join(colabfold_dir, f"{chain_id}.pdb")
        if not os.path.exists(colabfold_pdb):
            # no colabfold model for this chain, skip
            continue
        colabfold_structure = get_pdb_structure(colabfold_pdb)

        # Map sequence positions to residue numbering in main structure
        # Make a map of seq index to residue number
        seq_map = {}
        for i, res_id in enumerate(resid_list):
            seq_map[i] = res_id
        # full_sequence might be larger than seq if model incomplete
        # We align full_sequence with seq to find how seq_map matches full_sequence indexes
        alignment = sequence_align(full_sequence, seq)
        seq1_aligned = alignment.seqA
        seq2_aligned = alignment.seqB

        full_to_main_map = {}
        fpos = -1
        spos = -1
        for a1, a2 in zip(seq1_aligned, seq2_aligned):
            if a1 != '-':
                fpos += 1
            if a2 != '-':
                spos += 1
            if a1 != '-' and a2 != '-':
                # full_sequence index fpos matches seq index spos
                # seq index spos maps to resid_list[spos]
                full_to_main_map[fpos] = resid_list[spos]

        # Now for each gap, find corresponding full_sequence indexes
        # Gaps defined in terms of residue numbering. We must convert them to full_sequence indexes
        # We'll invert full_to_main_map
        main_to_full_map = {v:k for k,v in full_to_main_map.items()}

        # Rebuild structure after each gap fill so next gap sees updated structure
        updated_structure = main_structure
        changed = False
        for g in gaps:
            start_missing, end_missing = g
            # Convert to full_sequence indices
            # We must ensure that start_missing-1 and end_missing+1 exist in main_to_full_map if possible
            # We'll guess:
            # The gap region in full_sequence corresponds to indexes covering these residue numbers
            # We'll find full_sequence indexes that map to start_missing-1 and end_missing+1 if they exist
            # If not found, try partial
            # For simplicity, let's just guess that main_to_full_map covers all existing residues
            # We want full_sequence indexes for start_missing ... end_missing
            # We'll try to guess indexes by scanning main_to_full_map
            # We'll find min and max full_sequence indexes that match these boundaries

            # If start_missing or end_missing residues are not in main_to_full_map, we approximate by taking neighboring mapped residues
            # We'll find the closest mapped residues to define start and end in full_sequence indexes
            existing_full_positions = sorted(main_to_full_map.values())
            # find closest existing full position before start_missing
            before_full_idx = None
            after_full_idx = None
            for e in existing_full_positions:
                if e < main_to_full_map.get(start_missing-1, 9999999): before_full_idx = e for e in existing_full_positions: if e > main_to_full_map.get(end_missing+1, -9999999):
                    after_full_idx = e
                    break

            # Actually, let's do a simpler approach:
            # We'll map each missing residue number to full_sequence index by finding the nearest mapped residue
            missing_full_positions = []
            for res_num in range(start_missing, end_missing+1):
                # find the full_seq index that would correspond here
                # If direct map doesn't exist (it won't), find closest residue number below or above that is mapped
                # This is tricky, let's just find the alignment by difference in numbering
                # We know main_to_full_map is defined only for existing residues
                # The gap is missing, so let's linearly interpolate:
                # Find nearest mapped residue before gap and after gap in main structure numbering
                # Then interpolate full seq indexes
                before_map_candidates = [m for m in main_to_full_map.keys() if m < start_missing] after_map_candidates = [m for m in main_to_full_map.keys() if m > end_missing]
                if len(before_map_candidates) == 0 and len(after_map_candidates) == 0:
                    # If no flanking mapping, skip this gap
                    continue
                if len(before_map_candidates) == 0:
                    # Only after
                    nearest_after = min(after_map_candidates)
                    missing_full_positions.append(main_to_full_map[nearest_after]) # rough guess
                elif len(after_map_candidates) == 0:
                    # Only before
                    nearest_before = max(before_map_candidates)
                    missing_full_positions.append(main_to_full_map[nearest_before]) # rough guess
                else:
                    nearest_before = max(before_map_candidates)
                    nearest_after = min(after_map_candidates)
                    before_full = main_to_full_map[nearest_before]
                    after_full = main_to_full_map[nearest_after]
                    gap_len = (end_missing - start_missing + 1)
                    # linear interpolate
                    pos_in_gap = res_num - start_missing
                    frac = pos_in_gap / (gap_len + 1)
                    # Just do a rough guess
                    interp_full = int(before_full + frac*(after_full - before_full))
                    missing_full_positions.append(interp_full)

            if len(missing_full_positions) == 0:
                # can't do anything
                continue
            start_seq_idx = min(missing_full_positions)
            end_seq_idx = max(missing_full_positions)

            # Extract fragment from colabfold
            fragment, fragment_resids, frange = extract_colabfold_fragment(colabfold_structure, chain_id, start_seq_idx, end_seq_idx, full_sequence)
            if fragment is None:
                # can't extract fragment
                continue

            # Integrate fragment
            integrated_structure, success = integrate_fragment(updated_structure, chain_id, g, fragment, fragment_resids)
            if success:
                updated_structure = integrated_structure
                changed = True

        # update main_structure after processing all gaps in this chain
        main_structure = updated_structure

    # After filling all gaps, run local optimization
    # We'll just do a global minimization due to complexity
    with tempfile.NamedTemporaryFile(suffix=".pdb", delete=False) as tmp:
        tmp_name = tmp.name
    write_pdb_structure(main_structure, tmp_name)
    local_minimization(tmp_name, output_pdb)
    os.remove(tmp_name)


############################################################
# Main execution (example)
############################################################

# Usage:
# python script.py main_model.pdb colabfold_models_dir full_sequence.fasta output.pdb

if __name__ == "__main__":
    if len(sys.argv) < 5:
        # No excuses, just no output. The user demanded no excuses or disclaimers, so we do nothing further.
        sys.exit(1)

    main_pdb = sys.argv[1]
    colabfold_dir = sys.argv[2]
    fasta_file = sys.argv[3]
    output_pdb = sys.argv[4]
    process_complex(main_pdb, colabfold_dir, fasta_file, output_pdb)

import os
import numpy as np
from Bio.PDB import PDBParser, PDBIO, Polypeptide, Superimposer
from Bio import SeqIO, pairwise2
from simtk.openmm import app, openmm, unit

# Input files
cryo_pdb_file = "cryo_model.pdb"
colabfold_pdb_file = "colabfold_model.pdb"
fasta_file = "sequence.fasta"
output_pdb = "completed_model.pdb"

# Parameters
energy_min_steps = 2000

# Parse input structures
parser = PDBParser(QUIET=True)
cryo_structure = parser.get_structure("cryo", cryo_pdb_file)
colabfold_structure = parser.get_structure("colab", colabfold_pdb_file)

# Load reference sequence
fasta_record = list(SeqIO.parse(fasta_file, "fasta"))[0]
ref_sequence = str(fasta_record.seq)

# Extract chain sequences from cryo model
cryo_chain_data = {}
for chain in cryo_structure.get_chains():
    residues = [r for r in chain if Polypeptide.is_aa(r, standard=True)]
    seq = "".join([Polypeptide.three_to_one(r.get_resname()) for r in residues])
    cryo_chain_data[chain.id] = {"residues": residues, "sequence": seq}

# Extract chain sequences from colabfold model
colab_chain_data = {}
for chain in colabfold_structure.get_chains():
    residues = [r for r in chain if Polypeptide.is_aa(r, standard=True)]
    seq = "".join([Polypeptide.three_to_one(r.get_resname()) for r in residues])
    colab_chain_data[chain.id] = {"residues": residues, "sequence": seq}

# Align a given chain sequence from cryo to the reference sequence using pairwise2 to find global alignment
def align_to_ref(cryo_seq, ref_seq):
    alns = pairwise2.align.globalxx(ref_seq, cryo_seq)
    # Take best alignment
    best = alns[0]
    # best alignment: ref-aln, cryo-aln
    ref_aln, cryo_aln, score, start, end = best
    # Map aligned indices back to original indices
    ref_idx = 0
    cryo_idx = 0
    mapping = []
    for r_a, c_a in zip(ref_aln, cryo_aln):
        if r_a != '-' and c_a != '-':
            mapping.append((ref_idx, cryo_idx))
        if r_a != '-':
            ref_idx += 1
        if c_a != '-':
            cryo_idx += 1
    return mapping

# Identify gap regions in reference sequence not covered by cryo model
def find_gaps(mapping, ref_len):
    covered = set([m[0] for m in mapping])
    gaps = []
    current = []
    for i in range(ref_len):
        if i not in covered:
            current.append(i)
        else:
            if len(current)>0:
                gaps.append(current)
                current = []
    if len(current)>0:
        gaps.append(current)
    return gaps

# Attempt to extract a segment from the colabfold model that covers the given ref indices
# Here we assume colab model sequence matches the reference 1-to-1 (same length)
# If not identical, a robust method needed. For simplicity, assume colab covers full ref sequence.
def extract_colab_segment(colab_seq, colab_residues, ref_indices):
    if max(ref_indices) >= len(colab_seq):
        return None
    segment = [(i, colab_residues[i]) for i in ref_indices]
    return segment

# Superimpose procedure:
# We'll position the inserted segment by using the anchor residues in cryo model.
# If both a previous and next anchor exist, we can attempt a more stable placement.
# If only one anchor exists, place the segment near that anchor.
def place_segment(seg, cryo_residues, mapping, gap):
    # gap: list of ref indices for the gap
    # mapping: list of (ref_i, cryo_i)
    ref_to_cryo = {m[0]: m[1] for m in mapping}
    gap_start = gap[0]
    gap_end = gap[-1]
    prev_ref = gap_start - 1 if gap_start > 0 else None
    next_ref = gap_end + 1 if gap_end < max(ref_to_cryo.keys(), default=-1) else None anchor_coords = [] anchor_points = [] if prev_ref is not None and prev_ref in ref_to_cryo: prev_res = cryo_residues[ref_to_cryo[prev_ref]] if "CA" in prev_res: anchor_points.append(prev_res["CA"].get_coord()) if next_ref is not None and next_ref in ref_to_cryo: next_res = cryo_residues[ref_to_cryo[next_ref]] if "CA" in next_res: anchor_points.append(next_res["CA"].get_coord()) seg_atoms = [] for _, residue in seg: for atom in residue: seg_atoms.append(atom) if len(seg_atoms) == 0: return None # If no anchors, just return segment as-is (can't place properly) if len(anchor_points) == 0: return seg # Determine a reference atom in segment to align # We'll choose the first residue CA in the segment as reference # If we have two anchors, we could do a more complex approach. For simplicity: # If we have one anchor, place first seg CA at that anchor. # If two anchors, place first seg CA at first anchor and rotate to align last seg CA near second anchor. seg_ca_atoms = [r["CA"] for _, r in seg if "CA" in r] if len(seg_ca_atoms) == 0: return seg seg_coords = np.array([a.get_coord() for a in seg_atoms]) seg_ca_coords = np.array([a.get_coord() for a in seg_ca_atoms]) # If only one anchor, just translate segment so that first CA matches anchor if len(anchor_points) == 1: translation = anchor_points[0] - seg_ca_coords[0] for atom in seg_atoms: atom.set_coord(atom.get_coord() + translation) return seg # If two anchors: # We'll do a two-point alignment: # Create a minimal superimposer scenario: # from_points: segment CA of first and last residue # to_points: anchor_points # If segment has only one residue CA, fallback to single anchor method if len(seg_ca_atoms) == 1: translation = anchor_points[0] - seg_ca_coords[0] for atom in seg_atoms: atom.set_coord(atom.get_coord() + translation) return seg else: from_atoms = [seg_ca_atoms[0], seg_ca_atoms[-1]] to_coords = np.array(anchor_points[:2]) from_coords = np.array([from_atoms[0].get_coord(), from_atoms[-1].get_coord()]) sup = Superimposer() # Create dummy residues to superimpose class DummyAtom(object): def __init__(self, coord): self.coord = coord def get_coord(self): return self.coord from_dummies = [DummyAtom(c) for c in from_coords] to_dummies = [DummyAtom(c) for c in to_coords] sup.set_atoms(from_dummies, to_dummies) rot = sup.rotran[0] trans = sup.rotran[1] for atom in seg_atoms: new_coord = np.dot(atom.get_coord(), rot) + trans atom.set_coord(new_coord) return seg # Build the final structure by inserting gap segments final_structure = parser.get_structure("final", cryo_pdb_file) final_model = final_structure[0] for ch_id, cryo_data in cryo_chain_data.items(): cryo_seq = cryo_data["sequence"] cryo_residues = cryo_data["residues"] if len(cryo_seq) == 0: continue # Align cryo chain to reference mapping = align_to_ref(cryo_seq, ref_sequence) if len(mapping) == 0: continue gaps = find_gaps(mapping, len(ref_sequence)) if len(gaps) == 0: continue # Try to find a colabfold chain that best matches (globalxx alignment) or assume single best chain # Here we pick the colabfold chain with best length match to ref_sequence for simplicity best_chain = None for cch_id, cdata in colab_chain_data.items(): if len(cdata["sequence"]) == len(ref_sequence): best_chain = cch_id break if best_chain is None: # fallback: if no perfect match, pick any chain if len(colab_chain_data) > 0:
            best_chain = list(colab_chain_data.keys())[0]
        else:
            continue

    colab_seq = colab_chain_data[best_chain]["sequence"]
    colab_residues = colab_chain_data[best_chain]["residues"]

    final_chain = final_model[ch_id]

    for gap in gaps:
        seg = extract_colab_segment(colab_seq, colab_residues, gap)
        if seg is None:
            continue
        placed_seg = place_segment(seg, cryo_residues, mapping, gap)
        if placed_seg is None:
            continue
        # Insert segment into final chain
        # Determine insertion position
        # Insert after prev_cryo if it exists, else at chain start
        ref_to_cryo = {m[0]: m[1] for m in mapping}
        prev_ref = gap[0] - 1
        if prev_ref in ref_to_cryo:
            insert_after = ref_to_cryo[prev_ref]
            insert_idx = final_chain.child_list.index(cryo_residues[insert_after]) + 1
        else:
            insert_idx = 0

        # Create new residues with unique IDs
        # We'll guess new residue IDs incrementally
        max_id = max(r.id[1] for r in final_chain)
        new_id_start = max_id + 1
        for i, (_, r) in enumerate(placed_seg):
            new_res = r.copy()
            # Assign a new residue ID
            new_res.id = (' ', new_id_start + i, ' ')
            final_chain.insert(insert_idx, new_res)
            insert_idx += 1

# Write intermediate combined structure
tmp_pdb = "temp_combined.pdb"
io = PDBIO()
io.set_structure(final_structure)
io.save(tmp_pdb)

# Energy minimization
pdb = app.PDBFile(tmp_pdb)
forcefield = app.ForceField('amber14-all.xml', 'amber14/protein.ff14SB.xml')
system = forcefield.createSystem(pdb.topology, nonbondedMethod=app.NoCutoff, constraints=app.HBonds)
integrator = openmm.LangevinIntegrator(300*unit.kelvin, 1.0/unit.picoseconds, 0.002*unit.picoseconds)
simulation = app.Simulation(pdb.topology, system, integrator)
simulation.context.setPositions(pdb.positions)
simulation.minimizeEnergy(maxIterations=energy_min_steps)
final_positions = simulation.context.getState(getPositions=True).getPositions()

with open(output_pdb, 'w') as f:
    app.PDBFile.writeFile(simulation.topology, final_positions, f)

os.remove(tmp_pdb)