Skip to content

Commit

Permalink
Create _mapping file during createdb for mmCIF input
Browse files Browse the repository at this point in the history
  • Loading branch information
milot-mirdita committed May 8, 2023
1 parent 34a35bb commit ea587d0
Show file tree
Hide file tree
Showing 5 changed files with 179 additions and 44 deletions.
3 changes: 3 additions & 0 deletions src/commons/LocalParameters.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ LocalParameters::LocalParameters() :
PARAM_MASK_BFACTOR_THRESHOLD(PARAM_MASK_BFACTOR_THRESHOLD_ID,"--mask-bfactor-threshold", "Mask b-factor threshold", "mask residues for seeding if b-factor < thr [0,100]",typeid(float), (void *) &maskBfactorThreshold, "^[0-9]*(\\.[0-9]+)?$"),
PARAM_ALIGNMENT_TYPE(PARAM_ALIGNMENT_TYPE_ID,"--alignment-type", "Alignment type", "How to compute the alignment:\n0: 3di alignment\n1: TM alignment\n2: 3Di+AA",typeid(int), (void *) &alignmentType, "^[0-2]{1}$"),
PARAM_CHAIN_NAME_MODE(PARAM_CHAIN_NAME_MODE_ID,"--chain-name-mode", "Chain name mode", "Add chain to name:\n0: auto\n1: always add\n",typeid(int), (void *) &chainNameMode, "^[0-1]{1}$", MMseqsParameter::COMMAND_EXPERT),
PARAM_WRITE_MAPPING(PARAM_WRITE_MAPPING_ID, "--write-mapping", "Write mapping file", "write _mapping file containing mapping from internal id to taxonomic identifier", typeid(int), (void *) &writeMapping, "^[0-1]{1}", MMseqsParameter::COMMAND_EXPERT),
PARAM_TMALIGN_FAST(PARAM_TMALIGN_FAST_ID,"--tmalign-fast", "TMalign fast","turn on fast search in TM-align" ,typeid(int), (void *) &tmAlignFast, "^[0-1]{1}$"),
PARAM_N_SAMPLE(PARAM_N_SAMPLE_ID, "--n-sample", "Sample size","pick N random sample" ,typeid(int), (void *) &nsample, "^[0-9]{1}[0-9]*$"),
PARAM_COORD_STORE_MODE(PARAM_COORD_STORE_MODE_ID, "--coord-store-mode", "Coord store mode", "Coordinate storage mode: \n1: C-alpha as float\n2: C-alpha as difference (uint16_t)", typeid(int), (void *) &coordStoreMode, "^[1-2]{1}$",MMseqsParameter::COMMAND_EXPERT),
Expand Down Expand Up @@ -62,6 +63,7 @@ LocalParameters::LocalParameters() :
substitutionMatrices.emplace_back("3di.out", mat3di_out, mat3di_out_len);
// structurecreatedb
structurecreatedb.push_back(&PARAM_CHAIN_NAME_MODE);
structurecreatedb.push_back(&PARAM_WRITE_MAPPING);
structurecreatedb.push_back(&PARAM_MASK_BFACTOR_THRESHOLD);
structurecreatedb.push_back(&PARAM_COORD_STORE_MODE);
structurecreatedb.push_back(&PARAM_WRITE_LOOKUP);
Expand Down Expand Up @@ -154,6 +156,7 @@ LocalParameters::LocalParameters() :
minDiagScoreThr = 30;
maskBfactorThreshold = 0;
chainNameMode = 0;
writeMapping = 0;
tmAlignFast = 1;
gapOpen = 10;
gapExtend = 1;
Expand Down
2 changes: 2 additions & 0 deletions src/commons/LocalParameters.h
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,7 @@ class LocalParameters : public Parameters {
PARAMETER(PARAM_MASK_BFACTOR_THRESHOLD)
PARAMETER(PARAM_ALIGNMENT_TYPE)
PARAMETER(PARAM_CHAIN_NAME_MODE)
PARAMETER(PARAM_WRITE_MAPPING)
PARAMETER(PARAM_TMALIGN_FAST)
PARAMETER(PARAM_N_SAMPLE)
PARAMETER(PARAM_COORD_STORE_MODE)
Expand All @@ -97,6 +98,7 @@ class LocalParameters : public Parameters {
float maskBfactorThreshold;
int alignmentType;
int chainNameMode;
bool writeMapping;
int tmAlignFast;
int nsample;
int coordStoreMode;
Expand Down
60 changes: 50 additions & 10 deletions src/strucclustutils/GemmiWrapper.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,34 @@ GemmiWrapper::GemmiWrapper(){
{"UNK",'X'}};
}

std::unordered_map<std::string, int> getEntityTaxIDMapping(gemmi::cif::Document& doc) {
std::unordered_map<std::string, int> entity_to_taxid;
static const std::vector<std::pair<std::string, std::string>> loops_with_taxids = {
{ "_entity_src_nat.", "?pdbx_ncbi_taxonomy_id"},
{ "_entity_src_gen.", "?pdbx_gene_src_ncbi_taxonomy_id"},
{ "_pdbx_entity_src_syn.", "?ncbi_taxonomy_id"}
};
for (gemmi::cif::Block& block : doc.blocks) {
for (auto&& [loop, taxid] : loops_with_taxids) {
for (auto row : block.find(loop, {"entity_id", taxid})) {
if (row.has2(1) == false) {
continue;
}
std::string entity_id = gemmi::cif::as_string(row[0]);
if (entity_to_taxid.find(entity_id) != entity_to_taxid.end()) {
continue;
}
const char* endptr = NULL;
int taxId = gemmi::no_sign_atoi(row[1].c_str(), &endptr);
if (endptr != NULL && *endptr == '\0') {
entity_to_taxid.emplace(entity_id, taxId);
}
}
}
}
return entity_to_taxid;
}

bool GemmiWrapper::load(std::string & filename){
if (gemmi::iends_with(filename, ".fcz")) {
std::ifstream in(filename, std::ios::binary);
Expand All @@ -45,9 +73,11 @@ bool GemmiWrapper::load(std::string & filename){
gemmi::MaybeGzipped infile(filename);
gemmi::CoorFormat format = gemmi::coor_format_from_ext(infile.basepath());
gemmi::Structure st;
std::unordered_map<std::string, int> entity_to_tax_id;
switch (format) {
case gemmi::CoorFormat::Mmcif: {
gemmi::cif::Document doc = gemmi::cif::read(infile);
entity_to_tax_id = getEntityTaxIDMapping(doc);
st = gemmi::make_structure(doc);
break;
}
Expand All @@ -60,7 +90,7 @@ bool GemmiWrapper::load(std::string & filename){
default:
st = gemmi::read_pdb(infile);
}
updateStructure((void*) &st, filename);
updateStructure((void*) &st, filename, entity_to_tax_id);
} catch (std::runtime_error& e) {
return false;
}
Expand Down Expand Up @@ -89,20 +119,22 @@ bool GemmiWrapper::loadFromBuffer(const char * buffer, size_t bufferSize, const
gemmi::MaybeGzipped infile(name);
gemmi::CoorFormat format = gemmi::coor_format_from_ext(infile.basepath());
gemmi::Structure st;
std::unordered_map<std::string, int> entity_to_tax_id;
switch (format) {
case gemmi::CoorFormat::Pdb:
st = gemmi::pdb_impl::read_pdb_from_stream(gemmi::MemoryStream(buffer, bufferSize), name, gemmi::PdbReadOptions());
break;
case gemmi::CoorFormat::Mmcif: {
gemmi::cif::Document doc = gemmi::cif::read_memory(buffer, bufferSize, name.c_str());
entity_to_tax_id = getEntityTaxIDMapping(doc);
st = gemmi::make_structure(doc);
break;
}
case gemmi::CoorFormat::Unknown:
case gemmi::CoorFormat::Detect:
return false;
}
updateStructure((void*) &st, name);
updateStructure((void*) &st, name, entity_to_tax_id);
} catch (std::runtime_error& e) {
return false;
}
Expand Down Expand Up @@ -190,7 +222,7 @@ bool GemmiWrapper::loadFoldcompStructure(std::istream& stream, const std::string
return true;
}

void GemmiWrapper::updateStructure(void * void_st, const std::string& filename) {
void GemmiWrapper::updateStructure(void * void_st, const std::string& filename, std::unordered_map<std::string, int>& entity_to_tax_id) {
gemmi::Structure * st = (gemmi::Structure *) void_st;

title.clear();
Expand All @@ -203,11 +235,11 @@ void GemmiWrapper::updateStructure(void * void_st, const std::string& filename)
cb.clear();
n.clear();
ami.clear();
title.append( st->get_info("_struct.title"));
taxIds.clear();
title.append(st->get_info("_struct.title"));
size_t currPos = 0;
for (gemmi::Model& model : st->models){
for (gemmi::Chain& ch : model.chains) {

size_t chainStartPos = currPos;
size_t pos = filename.find_last_of("\\/");
std::string name = (std::string::npos == pos)
Expand All @@ -216,19 +248,26 @@ void GemmiWrapper::updateStructure(void * void_st, const std::string& filename)
//name.push_back('_');
chainNames.push_back(ch.name);
names.push_back(name);
int taxId = -1;
for (gemmi::Residue &res : ch.residues) {
if (taxId == -1) {
auto it = entity_to_tax_id.find(res.entity_id);
if (it != entity_to_tax_id.end()) {
taxId = it->second;
}
}
bool isHetAtomInList = res.het_flag == 'H' && threeAA2oneAA.find(res.name) != threeAA2oneAA.end();
if(isHetAtomInList == false && res.het_flag != 'A')
if (isHetAtomInList == false && res.het_flag != 'A')
continue;
if(isHetAtomInList){
if (isHetAtomInList) {
bool hasCA = false;
for(gemmi::Atom &atom : res.atoms){
if(atom.name == "CA"){
for (gemmi::Atom &atom : res.atoms) {
if (atom.name == "CA") {
hasCA = true;
break;
}
}
if(hasCA == false){
if (hasCA == false) {
continue;
}
}
Expand Down Expand Up @@ -269,6 +308,7 @@ void GemmiWrapper::updateStructure(void * void_st, const std::string& filename)
ami.push_back(threeAA2oneAA[res.name]);
}
}
taxIds.push_back(taxId == -1 ? 0 : taxId);
chain.push_back(std::make_pair(chainStartPos, currPos));
}
}
Expand Down
3 changes: 2 additions & 1 deletion src/strucclustutils/GemmiWrapper.h
Original file line number Diff line number Diff line change
Expand Up @@ -28,14 +28,15 @@ class GemmiWrapper {
std::vector<std::string> names;
std::vector<std::string> chainNames;
std::vector<std::pair<size_t ,size_t>> chain;
std::vector<int> taxIds;
std::string title;
private:
std::unordered_map<std::string,char> threeAA2oneAA;
int modelIt;
int chainIt;

bool loadFoldcompStructure(std::istream& stream, const std::string& filename);
void updateStructure(void * structure, const std::string & filename);
void updateStructure(void * structure, const std::string & filename, std::unordered_map<std::string, int>& entity_to_tax_id);
};


Expand Down
Loading

0 comments on commit ea587d0

Please sign in to comment.