Source code for developer.align_mapping

#!/usr/bin/env python3
"""
A helper program to add SMILEIDX to mapping files.

This program align MD molecules from the Universe to an rdkit molecule
created by SMILES. Can be used with a care to semi-manualy curate mapping
files, SMILES, and add the alignments for only heavy atoms. Further should
be converted into a mature CI/CD script.
"""

import argparse
import os
import re
import sys
from copy import deepcopy

import MDAnalysis as mda
import numpy as np
import yaml
from rdkit import Chem, RDLogger
from rdkit.Chem import MolStandardize

import fairmd.lipids.api as dlapi
import fairmd.lipids.core as dlc
import fairmd.lipids.molecules as dlm
from fairmd.lipids.auxiliary import mollib

lg = RDLogger.logger()
lg.setLevel(RDLogger.CRITICAL)


[docs] def get_1mol_selstr(comp_name: str, mol_obj: dlm.Molecule) -> str: """Return selection string for a single molecule""" res_set = set() try: for atom in mol_obj.mapping_dict: res_set.add(mol_obj.mapping_dict[atom]["RESIDUE"]) except (KeyError, TypeError): res_set = {comp_name} return "resname " + " or resname ".join(sorted(res_set))
[docs] def get_brutto_formula(eorder: str, agrp: mda.AtomGroup, charge: float = 0) -> str: """Get brutto formula (according to element order) of neutralized form""" ans = "" for e in eorder: ans += e n_ = (agrp.atoms.elements == e).sum() if e == "H" and charge < 0: n_ -= int(charge) ans += "" if n_ == 1 else str(n_) return ans
[docs] def compare_neutralized(a: Chem.rdchem.Mol, b: Chem.rdchem.Mol) -> bool: """Compare neutralized forms of molecules""" a_ = MolStandardize.rdMolStandardize.ChargeParent(a) b_ = MolStandardize.rdMolStandardize.ChargeParent(b) aib = a_.HasSubstructMatch(b_) bia = b_.HasSubstructMatch(a_) return aib and bia
DONE_LOG_FNAME = "align-mapping-file.log"
[docs] def main(): parser = argparse.ArgumentParser() parser.add_argument( "--skip-ids", type=lambda s: [int(x) for x in s.split(",")], default=[], help="Comma-separated list of IDs to skip", ) args = parser.parse_args() ss = dlc.initialize_databank() done_ids = [] if os.path.isfile(DONE_LOG_FNAME): with open(DONE_LOG_FNAME) as fd: done_ids = list(map(int, fd.readlines())) done_ids.extend(args.skip_ids) # add defective systems print("Loading current state..") print(done_ids) for s in ss: print(s) if s["ID"] in done_ids: print(" -- Already done. Skipping.") continue if "TPR" not in s or s["TPR"] is None: print(" -- non-TPR currently not implemented. Skipping.") continue if s.get("UNITEDATOM_DICT", False): print(" -- Cannot work with UA") continue print("LIPID COMPOSITION: ", s.lipids.keys()) u: mda.Universe | None = None for cur_lip, mol_obj in s.lipids.items(): print("Current lipids ", cur_lip) smileids = [x["SMILEIDX"] for x in mol_obj.mapping_dict.values() if "SMILEIDX" in x] # head dict if smileids: print(f" -- SMILEIDX already present ({len(smileids)}). Skipping.") continue print("START WORKING -- ", s) if u is None: uc = dlapi.UniverseConstructor(s) uc.download_mddata(skip_traj=True) u = uc.build_universe() # use internal element guesser u.guess_TopologyAttrs(force_guess=["elements"]) mollib.guess_elements(s, u) # select all molecules in the system sel_str = get_1mol_selstr(s["COMPOSITION"][cur_lip]["NAME"], mol_obj) print(sel_str) all_atoms_of_mol = u.select_atoms(sel_str) print(all_atoms_of_mol) # start checking the consistency metadata_bformula = ( mol_obj.metadata["bioschema_properties"]["molecularFormula"].replace("+", "").replace("-", "") ) eorder = re.sub(r"\d+", "", metadata_bformula) metadata_charge = float(mol_obj.metadata["NMRlipids"]["charge"]) metadata_mweight = float(mol_obj.metadata["bioschema_properties"]["molecularWeight"]) + metadata_charge smiles = mol_obj.metadata["bioschema_properties"]["smiles"] mol_from_smiles = Chem.MolFromSmiles(smiles) molecules = all_atoms_of_mol.groupby("molnums") last_good_mol = None for _, mol_ in molecules.items(): if u.atoms.select_atoms(f"molnum {_}") != mol_: continue try: mol_from_md = mol_.atoms.convert_to("rdkit") last_good_mol = mol_ except (Chem.AtomValenceException, KeyError): print(f"Molecule {_} has bad conformation. Trying another one.", file=sys.stderr) last_good_mol = None continue print(".", end="", flush=True) mass_close = np.isclose(mol_.masses.sum(), metadata_mweight, atol=0.1) if not mass_close: # it can be because of incorrect average isotopic mass print( f"Masses do not correspond to each other: {_} {mol_.masses.sum()} / {metadata_mweight}.", file=sys.stderr, ) cur_brutto = get_brutto_formula(eorder, mol_, metadata_charge) assert cur_brutto == metadata_bformula, ( f"Brutto formulas do not correspond to each other {cur_brutto} / {metadata_bformula}" ) assert compare_neutralized(mol_from_smiles, mol_from_md), "SMILES != MD" # check-ups are done # new mapping new_mapping_dict = deepcopy(mol_obj.mapping_dict) # make no-explicit-H-mol and check match-to-smiles mm = Chem.RemoveHs(mol_from_md) mtch = mm.GetSubstructMatch(mol_from_smiles) assert not (set(range(mol_from_smiles.GetNumHeavyAtoms())) - set(mtch)), ( "Match string should be a permutation of [1,...,N] where N is the number of heavy atoms" ) rdatit = mol_from_md.GetAtoms() rdatit2 = mm.GetAtoms() mtch_iter = iter(np.argsort(mtch)) for mdat in last_good_mol.atoms: a = next(rdatit) if a.GetAtomicNum() == 1: print(a.GetAtomicNum(), mdat.name) else: b = next(rdatit2) match_in_smile = next(mtch_iter) un = mol_obj.md2uan(mdat.name, mdat.resname) print(un, a.GetAtomicNum(), mdat.name, b.GetAtomicNum(), match_in_smile) new_mapping_dict[un]["SMILEIDX"] = int(match_in_smile) we_have_idx = False for k, v in new_mapping_dict.items(): if "SMILEIDX" in v: if not we_have_idx and "SMILEIDX" in mol_obj.mapping_dict[k]: we_have_idx = True if we_have_idx: assert "SMILEIDX" in mol_obj.mapping_dict[k], "SMILEIDX is set, but not for this atom!" assert mol_obj.mapping_dict[k]["SMILEIDX"] == v["SMILEIDX"], "SMILEIDX is wrong!!" if not we_have_idx: print("Saving mapping with SMILE indexes for the first time!") with open(mol_obj._mapping_fpath, "w") as fd: fd.write(yaml.safe_dump(new_mapping_dict, sort_keys=False, indent=1)) with open(DONE_LOG_FNAME, "a") as fd: fd.write("%d\n" % s["ID"])
if __name__ == "__main__": main()