# Source code for developer.AutocompleteMetadata

"""
Script for membrane metadata autocomplete.

This script tries to fill in further attributes of a
:ref:`membrane metadata file <addnewmol>` based on the information
queried with the InChIKey from:

- UniChem
- ChEMBL
- ChEBI
- PubChem

.. note::
   This file is meant to be used by automated workflows.
"""

import json
import os
import sys
import urllib.request

import yaml


def check_api(url):
    """
    Probe whether an endpoint answers with HTTP 200.

    The URL is opened with a five second timeout. Any exception raised
    while opening or reading the response is treated as "not reachable".

    :param url: URL (or request target) to probe.
    :return: ``True`` if the response status is 200, otherwise ``False``.
    """
    reachable = False
    try:
        with urllib.request.urlopen(url, timeout=5) as resp:
            reachable = resp.status == 200
    except Exception:
        # Best-effort probe: every failure simply means "do not use it".
        reachable = False
    return reachable
def get_chembl(inchikey):
    """
    Query the ChEMBL molecule endpoint for an InChIKey.

    The endpoint is first probed with :func:`check_api`; the actual request
    is only sent when the probe succeeds.

    :param inchikey: Standard InChIKey inserted into the query string.
    :return: The decoded JSON payload exactly as returned by the service,
        or an empty dict if the service is unreachable, answers with a
        status other than 200, or any error occurs.
    """
    endpoint = f"https://www.ebi.ac.uk/chembl/api/data/molecule?standard_inchi_key={inchikey}&format=json"
    if not check_api(endpoint):
        return {}
    try:
        with urllib.request.urlopen(endpoint) as resp:
            if resp.status != 200:
                return {}
            return json.loads(resp.read().decode('utf-8'))
    except Exception:
        # Network and decoding problems are deliberately non-fatal.
        return {}
def get_pubchem(inchikey):
    """
    Fetch the PubChem property record for an InChIKey.

    The requested properties are IUPACName, SMILES, InChI, InChIKey,
    MolecularFormula and MolecularWeight.

    :param inchikey: InChIKey inserted into the PUG REST URL.
    :return: The first entry of ``PropertyTable -> Properties`` from the
        response, or an empty dict if the service is unreachable, the
        status is not 200, or any error occurs.
    """
    endpoint = (
        f"https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/inchikey/"
        f"{inchikey}/property/IUPACName,SMILES,InChI,InChIKey,MolecularFormula,MolecularWeight/JSON"
    )
    if not check_api(endpoint):
        return {}
    try:
        with urllib.request.urlopen(endpoint) as resp:
            if resp.status != 200:
                return {}
            payload = json.loads(resp.read().decode('utf-8'))
            return payload["PropertyTable"]["Properties"][0]
    except Exception:
        # Missing keys, empty tables and network errors all mean "no data".
        return {}
def get_pubchem_synonyms(cid):
    """
    Fetch the synonym list PubChem holds for a compound id.

    :param cid: PubChem compound id inserted into the PUG REST URL.
    :return: The ``Synonym`` list of the first ``Information`` entry, or an
        empty list if the service is unreachable, the status is not 200,
        the entry has no synonyms, or any error occurs.
    """
    endpoint = f"https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/{cid}/synonyms/JSON"
    if not check_api(endpoint):
        return []
    try:
        with urllib.request.urlopen(endpoint) as resp:
            if resp.status != 200:
                return []
            payload = json.loads(resp.read().decode('utf-8'))
            info_entries = payload.get("InformationList", {}).get("Information", [{}])
            return info_entries[0].get("Synonym", [])
    except Exception:
        # Any failure is treated as "no synonyms available".
        return []
def get_chebi(chebi_id):
    """
    Fetch the ChEBI compound record for a ChEBI id.

    :param chebi_id: Identifier inserted into the ChEBI compound URL. A
        falsy value short-circuits without any network access.
    :return: The decoded JSON payload, or an empty dict if no id was given,
        the service is unreachable, the status is not 200, or any error
        occurs.
    """
    if not chebi_id:
        return {}

    endpoint = f"https://www.ebi.ac.uk/chebi/backend/api/public/compound/{chebi_id}/?only_ontology_parents=false&only_ontology_children=false"
    try:
        if not check_api(endpoint):
            return {}
        with urllib.request.urlopen(endpoint) as resp:
            if resp.status == 200:
                return json.loads(resp.read().decode('utf-8'))
    except Exception:
        # Lookup is best-effort; fall through to the empty result.
        pass
    return {}
def get_unichem(inchikey):
    """
    Look up the cross-reference sources UniChem knows for an InChIKey.

    Availability is probed on the UniChem ``sources`` endpoint; the lookup
    itself is a JSON ``POST`` to the ``compounds`` endpoint.

    :param inchikey: InChIKey sent as the ``compound`` field of the request.
    :return: The ``sources`` list of the first returned compound, or an
        empty list if the service is unreachable, the status is not 200,
        nothing matched, or any error occurs.
    """
    if not check_api("https://www.ebi.ac.uk/unichem/api/v1/sources"):
        return []
    try:
        body = json.dumps({"type": "inchikey", "compound": inchikey}).encode('utf-8')
        request = urllib.request.Request(
            "https://www.ebi.ac.uk/unichem/api/v1/compounds",
            data=body,
            headers={'Content-Type': 'application/json'},
            method='POST',
        )
        with urllib.request.urlopen(request) as resp:
            if resp.status != 200:
                return []
            hits = json.loads(resp.read().decode('utf-8')).get("compounds", [])
            if hits and "sources" in hits[0]:
                return hits[0]["sources"]
    except Exception:
        # Lookup is best-effort; fall through to the empty result.
        pass
    return []
def extract_sameas(sources):
    """
    Translate UniChem source entries into ``sameAs`` cross references.

    Only sources whose ``shortName`` appears in the internal mapping are
    kept. ChEBI ids are prefixed with ``CHEBI:`` (an empty id becomes an
    empty string), and PubChem ids are converted to ``int`` when possible;
    an id that cannot be converted is stored unchanged.

    :param sources: Iterable of dicts with ``shortName`` and ``compoundId``
        keys. ``None`` is treated as an empty iterable, and entries lacking
        ``shortName`` are skipped instead of raising ``KeyError``.
    :return: Dict mapping the registry prefix to the compound identifier.
    """
    mapping = {
        "pubchem": "pubchem.compound",
        "chebi": "ChEBI",
        "chembl": "ChEMBL",
        "lipidmaps": "lipidmaps",
        "metabolights": "metabolights",
        "swisslipids": "slm",
        "pdb": "pdb.ligand",
        "unii": "unii",
        "cas": "cas",
    }
    result = {}
    for src in sources or []:
        prefix = mapping.get(src.get("shortName"))
        if not prefix:
            continue
        value = src.get("compoundId")
        if prefix == "ChEBI":
            value = f"CHEBI:{value}" if value else ""
        elif prefix == "pubchem.compound":
            try:
                value = int(value)
            except (TypeError, ValueError):
                # TypeError covers a null/missing id, ValueError a
                # non-numeric string; keep the raw value in both cases.
                pass
        result[prefix] = value
    return result
def get_chembl_id_from_unichem(sources):
    """
    Pick the ChEMBL identifier out of a list of UniChem source entries.

    :param sources: Iterable of dicts with ``shortName`` and ``compoundId``
        keys.
    :return: The ``compoundId`` of the first entry whose ``shortName`` is
        ``"chembl"``, or ``None`` when there is no such entry.
    """
    chembl_ids = (
        entry["compoundId"]
        for entry in sources
        if entry["shortName"] == "chembl"
    )
    return next(chembl_ids, None)
def load_existing_metadata(path):
    """
    Load the metadata YAML file, if it exists.

    :param path: Path of the metadata YAML file.
    :return: The parsed YAML content, or an empty dict when the file does
        not exist or is empty. An empty document parses to ``None``, which
        is normalised to ``{}`` so both "nothing there" cases look alike.
    """
    if not os.path.exists(path):
        return {}
    with open(path, 'r', encoding='utf-8') as f:
        return yaml.safe_load(f) or {}
def update_metadata(existing, new_data):
    """
    Fill missing entries of ``existing`` with values from ``new_data``.

    Values already present (and truthy) in ``existing`` always win; only
    missing or empty entries are filled. Nested dicts are merged
    recursively. Empty candidates (``None``, ``""``, ``{}``, ``[]``) are
    never written.

    :param existing: Metadata dict to update in place. ``None`` is treated
        as an empty dict.
    :param new_data: Dict with candidate values.
    :return: The updated ``existing`` dict.
    """
    if existing is None:
        existing = {}

    for key, value in new_data.items():
        current = existing.get(key)
        if isinstance(value, dict):
            if isinstance(current, dict):
                target = current
            elif not current:
                # A YAML key with an empty value loads as None (not as a
                # missing key); treat it like an empty mapping.
                target = {}
            else:
                # A non-empty, non-mapping value is existing data: keep it.
                continue
            updated = update_metadata(target, value)
            if updated:
                existing[key] = updated
        elif isinstance(value, list):
            if value:
                existing[key] = current or value
        elif value not in [None, "", {}]:
            existing[key] = current or value
    return existing
def main():
    """
    Autocomplete a membrane metadata YAML file in place.

    Reads the file given as the single command-line argument, takes the
    InChIKey from ``bioschema_properties -> inChIKey``, queries ChEMBL,
    PubChem, UniChem and ChEBI, and writes the merged result back to the
    same path. Values already present in the file are kept.

    The process exits with status 1 when the argument count is wrong or no
    InChIKey can be read from the file.
    """
    if len(sys.argv) != 2:
        print("Usage: python script.py <metadata.yaml path>")
        sys.exit(1)

    metadata_path = sys.argv[1]

    # Extract NMRlipidsID from path
    # (assumes structure: Molecules/membrane/<NMRlipidsID>/metadata.yaml)
    try:
        nmr_id = os.path.basename(os.path.dirname(metadata_path))
    except Exception:
        print("Error: Could not extract NMRlipidsID from path.")
        sys.exit(1)

    existing = load_existing_metadata(metadata_path)

    try:
        inchikey = existing["bioschema_properties"]["inChIKey"]
    except Exception:
        inchikey = None
    if not inchikey:
        # An empty key would only produce meaningless remote queries.
        print("Error: Could not find bioschema_properties -> inChIKey in YAML file.")
        sys.exit(1)

    chembl = get_chembl(inchikey)
    pubchem = get_pubchem(inchikey)
    sources = get_unichem(inchikey)
    sameas = extract_sameas(sources)

    # The ChEMBL molecule search wraps its hits in a "molecules" list, while
    # the fields read below live on the individual molecule record. Use the
    # first hit, falling back to the payload itself when there is no such
    # list so a pre-unwrapped record keeps working.
    molecules = chembl.get("molecules") if isinstance(chembl, dict) else None
    chembl_molecule = molecules[0] if molecules else (chembl or {})

    cid = pubchem.get("CID", sameas.get("pubchem.compound"))
    synonyms = get_pubchem_synonyms(cid) if cid else []

    # First, check if there's a ChEBI ID from UniChem
    chebi_id = (sameas.get("ChEBI") or "").replace("CHEBI:", "")
    chebi_data = get_chebi(chebi_id) if chebi_id else {}

    # Collect alternate names with priority
    alternate_names = []

    # 1. Try ChEBI synonyms first
    if chebi_data and 'names' in chebi_data:
        # Extract only the 'name' from SYNONYM type
        alternate_names = [
            syn['name']
            for syn in chebi_data.get('names', {}).get('SYNONYM', [])
            if syn.get('type') == 'SYNONYM' and syn.get('name')
        ]

    # 2. If no ChEBI synonyms, try ChEMBL synonyms (skipping empty entries)
    if not alternate_names:
        alternate_names = [
            syn['molecule_synonym']
            for syn in chembl_molecule.get("molecule_synonyms") or []
            if syn.get('molecule_synonym')
        ]

    # 3. If still no synonyms, try PubChem synonyms
    if not alternate_names and synonyms:
        alternate_names = synonyms

    # "or {}" also covers keys that are present but null in the response.
    molecule_props = chembl_molecule.get("molecule_properties") or {}
    molecule_structures = chembl_molecule.get("molecule_structures") or {}
    chembl_id = get_chembl_id_from_unichem(sources)

    # Image selection logic
    if chembl_id:
        image_url = f"https://www.ebi.ac.uk/chembl/api/data/image/{chembl_id}?dimensions=200"
    elif cid:
        image_url = f"https://pubchem.ncbi.nlm.nih.gov/image/imgsrv.fcgi?cid={cid}&t=l"
    else:
        image_url = ""

    bioschema = {
        "name": molecule_props.get("iupac_name") or pubchem.get("IUPACName", ""),
        "iupacName": molecule_props.get("iupac_name") or pubchem.get("IUPACName", ""),
        "molecularFormula": molecule_props.get("full_molformula") or pubchem.get("MolecularFormula", ""),
        "molecularWeight": float(molecule_props.get("full_mwt") or pubchem.get("MolecularWeight", 0)),
        "inChI": molecule_structures.get("standard_inchi") or pubchem.get("InChI", ""),
        "inChIKey": molecule_structures.get("standard_inchi_key") or pubchem.get("InChIKey", ""),
        "smiles": molecule_structures.get("canonical_smiles") or pubchem.get("SMILES", ""),
        "image": image_url,
        "description": "",
    }

    if alternate_names:
        bioschema["alternateName"] = alternate_names

    new_data = {
        "NMRlipids": {
            "id": nmr_id,
            "name": "",
            "charge": "",
        },
        "sameAs": sameas,
        "bioschema_properties": bioschema,
    }

    updated = update_metadata(existing, new_data)

    with open(metadata_path, 'w', encoding='utf-8') as f:
        yaml.dump(updated, f, sort_keys=False, allow_unicode=True, default_flow_style=False)

    print(f"Updated metadata written to {metadata_path}")
# Script entry point: run the autocomplete only when this file is executed
# directly, not when it is imported.
if __name__ == "__main__": main()