Source code for orsopy.slddb.importers

"""
Functions to create database compatible entries from other file formats.
"""
import os
import pathlib

from .dbconfig import db_lookup
from .material import Formula, PolymerSequence


[docs] class Importer(dict): """ Base class for importing database entries. Includes checks for correctness used by all importers. """ formula = None def __init__(self, filename, validate=True): self.filename = filename self.name = os.path.basename(filename).rsplit(".", 1)[0] data = self.build_data() if validate: self.validate(name=self.name, formula=self.formula, **data) dict.__init__(self, data)
[docs] @staticmethod def validate(**full_entry): # check for all values of the data dictionary if the format is valid for key, value in full_entry.items(): if not db_lookup[key][1].validate(value): raise ValueError(f"Can not import dataset, failed to validate value '{value}' for key '{key}'")
[docs] def build_data(self): raise NotImplementedError("Importer has to be subclassed with _build_data implemented.")
def __repr__(self): return f'MaterialData(name="{self.name}", formula={repr(self.formula)} , data={dict.__repr__(self)})'
[docs] class CifImporter(Importer): suffix = "cif" def __init__(self, filename, validate=True, sequence=1): self.sequence = sequence super().__init__(filename, validate=validate)
[docs] @staticmethod def float_werr(value): # Convert CIF entry that might have an uncertainty to float return float(value.split("(")[0])
[docs] def build_data(self): try: import CifFile except ImportError: raise RuntimeError("You have to install PyCifRW python package to be able to import cif files") output = {"data_origin": "diffraction", "comments": "imported from CIF file", "physical_state": "solid"} cf = CifFile.ReadCif(pathlib.Path(self.filename).as_uri()) block = cf.first_block() if "_chemical_formula_sum" in block: formula = Formula(block["_chemical_formula_sum"]) elif "_entity_poly.pdbx_seq_one_letter_code" in block: txt = block["_entity_poly.pdbx_seq_one_letter_code"] if type(txt) is list: formula = PolymerSequence(txt[self.sequence - 1]) else: formula = PolymerSequence(txt) output["tags"] = ["biology", "polymer"] output["reference"] = "Protein Data Bank (PDB)" output["ref_website"] = "https://www.rcsb.org/" if "_citation.pdbx_database_id_DOI" in block: output["doi"] = block["_citation.pdbx_database_id_DOI"][0] else: raise ValueError("Could not locate chemical formula or one letter PDB sequence") if "_exptl_crystal_density_diffrn" in block: output["density"] = self.float_werr(block["_exptl_crystal_density_diffrn"]) elif "_cell_volume" in block and "_cell_formula_units_Z" in block: output["FU_volume"] = self.float_werr(block["_cell_volume"]) / self.float_werr( block["_cell_formula_units_Z"] ) # ų elif "_entity_poly.pdbx_seq_one_letter_code" in block: # will use database FU_volume to deduce polymer density pass else: raise ValueError("No data to deduce material density") if "_chemical_name_mineral" in block: self.name = block["_chemical_name_mineral"] if all([ii in block for ii in ["_journal_name_full", "_journal_volume", "_journal_year", "_publ_author_name"]]): authors = ", ".join(block["_publ_author_name"]) journal = block["_journal_name_full"] volume = block["_journal_volume"] year = block["_journal_year"] if "_journal_page_first" in block: page = block["_journal_page_first"] else: page = "-" output["reference"] = f"{authors}; {journal}, {volume}, p. {page} ({year})".replace("\n", " ") if "_journal_paper_doi" in block: output["doi"] = "https://doi.org/" + block["_journal_paper_doi"] self.formula = formula return output
importers = [CifImporter]