diff --git a/src/commons/LocalParameters.cpp b/src/commons/LocalParameters.cpp index 7106c284..bbc7b02c 100644 --- a/src/commons/LocalParameters.cpp +++ b/src/commons/LocalParameters.cpp @@ -17,6 +17,7 @@ LocalParameters::LocalParameters() : PARAM_MASK_BFACTOR_THRESHOLD(PARAM_MASK_BFACTOR_THRESHOLD_ID,"--mask-bfactor-threshold", "Mask b-factor threshold", "mask residues for seeding if b-factor < thr [0,100]",typeid(float), (void *) &maskBfactorThreshold, "^[0-9]*(\\.[0-9]+)?$"), PARAM_ALIGNMENT_TYPE(PARAM_ALIGNMENT_TYPE_ID,"--alignment-type", "Alignment type", "How to compute the alignment:\n0: 3di alignment\n1: TM alignment\n2: 3Di+AA",typeid(int), (void *) &alignmentType, "^[0-2]{1}$"), PARAM_CHAIN_NAME_MODE(PARAM_CHAIN_NAME_MODE_ID,"--chain-name-mode", "Chain name mode", "Add chain to name:\n0: auto\n1: always add\n",typeid(int), (void *) &chainNameMode, "^[0-1]{1}$", MMseqsParameter::COMMAND_EXPERT), + PARAM_WRITE_MAPPING(PARAM_WRITE_MAPPING_ID, "--write-mapping", "Write mapping file", "write _mapping file containing mapping from internal id to taxonomic identifier", typeid(int), (void *) &writeMapping, "^[0-1]{1}", MMseqsParameter::COMMAND_EXPERT), PARAM_TMALIGN_FAST(PARAM_TMALIGN_FAST_ID,"--tmalign-fast", "TMalign fast","turn on fast search in TM-align" ,typeid(int), (void *) &tmAlignFast, "^[0-1]{1}$"), PARAM_N_SAMPLE(PARAM_N_SAMPLE_ID, "--n-sample", "Sample size","pick N random sample" ,typeid(int), (void *) &nsample, "^[0-9]{1}[0-9]*$"), PARAM_COORD_STORE_MODE(PARAM_COORD_STORE_MODE_ID, "--coord-store-mode", "Coord store mode", "Coordinate storage mode: \n1: C-alpha as float\n2: C-alpha as difference (uint16_t)", typeid(int), (void *) &coordStoreMode, "^[1-2]{1}$",MMseqsParameter::COMMAND_EXPERT), @@ -62,6 +63,7 @@ LocalParameters::LocalParameters() : substitutionMatrices.emplace_back("3di.out", mat3di_out, mat3di_out_len); // structurecreatedb structurecreatedb.push_back(&PARAM_CHAIN_NAME_MODE); + structurecreatedb.push_back(&PARAM_WRITE_MAPPING); structurecreatedb.push_back(&PARAM_MASK_BFACTOR_THRESHOLD); structurecreatedb.push_back(&PARAM_COORD_STORE_MODE); structurecreatedb.push_back(&PARAM_WRITE_LOOKUP); @@ -154,6 +156,7 @@ LocalParameters::LocalParameters() : minDiagScoreThr = 30; maskBfactorThreshold = 0; chainNameMode = 0; + writeMapping = 0; tmAlignFast = 1; gapOpen = 10; gapExtend = 1; diff --git a/src/commons/LocalParameters.h b/src/commons/LocalParameters.h index a66a7fea..86835440 100644 --- a/src/commons/LocalParameters.h +++ b/src/commons/LocalParameters.h @@ -84,6 +84,7 @@ class LocalParameters : public Parameters { PARAMETER(PARAM_MASK_BFACTOR_THRESHOLD) PARAMETER(PARAM_ALIGNMENT_TYPE) PARAMETER(PARAM_CHAIN_NAME_MODE) + PARAMETER(PARAM_WRITE_MAPPING) PARAMETER(PARAM_TMALIGN_FAST) PARAMETER(PARAM_N_SAMPLE) PARAMETER(PARAM_COORD_STORE_MODE) @@ -97,6 +98,7 @@ class LocalParameters : public Parameters { float maskBfactorThreshold; int alignmentType; int chainNameMode; + bool writeMapping; int tmAlignFast; int nsample; int coordStoreMode; diff --git a/src/strucclustutils/GemmiWrapper.cpp b/src/strucclustutils/GemmiWrapper.cpp index 6af8c7f8..b40122c7 100644 --- a/src/strucclustutils/GemmiWrapper.cpp +++ b/src/strucclustutils/GemmiWrapper.cpp @@ -33,6 +33,34 @@ GemmiWrapper::GemmiWrapper(){ {"UNK",'X'}}; } +std::unordered_map getEntityTaxIDMapping(gemmi::cif::Document& doc) { + std::unordered_map entity_to_taxid; + static const std::vector> loops_with_taxids = { + { "_entity_src_nat.", "?pdbx_ncbi_taxonomy_id"}, + { "_entity_src_gen.", "?pdbx_gene_src_ncbi_taxonomy_id"}, + { "_pdbx_entity_src_syn.", "?ncbi_taxonomy_id"} + }; + for (gemmi::cif::Block& block : doc.blocks) { + for (auto&& [loop, taxid] : loops_with_taxids) { + for (auto row : block.find(loop, {"entity_id", taxid})) { + if (row.has2(1) == false) { + continue; + } + std::string entity_id = gemmi::cif::as_string(row[0]); + if (entity_to_taxid.find(entity_id) != entity_to_taxid.end()) { + continue; + } + const char* endptr = NULL; + int taxId = gemmi::no_sign_atoi(row[1].c_str(), &endptr); + if (endptr != NULL && *endptr == '\0') { + entity_to_taxid.emplace(entity_id, taxId); + } + } + } + } + return entity_to_taxid; +} + bool GemmiWrapper::load(std::string & filename){ if (gemmi::iends_with(filename, ".fcz")) { std::ifstream in(filename, std::ios::binary); @@ -45,9 +73,11 @@ bool GemmiWrapper::load(std::string & filename){ gemmi::MaybeGzipped infile(filename); gemmi::CoorFormat format = gemmi::coor_format_from_ext(infile.basepath()); gemmi::Structure st; + std::unordered_map entity_to_tax_id; switch (format) { case gemmi::CoorFormat::Mmcif: { gemmi::cif::Document doc = gemmi::cif::read(infile); + entity_to_tax_id = getEntityTaxIDMapping(doc); st = gemmi::make_structure(doc); break; } @@ -60,7 +90,7 @@ bool GemmiWrapper::load(std::string & filename){ default: st = gemmi::read_pdb(infile); } - updateStructure((void*) &st, filename); + updateStructure((void*) &st, filename, entity_to_tax_id); } catch (std::runtime_error& e) { return false; } @@ -89,12 +119,14 @@ bool GemmiWrapper::loadFromBuffer(const char * buffer, size_t bufferSize, const gemmi::MaybeGzipped infile(name); gemmi::CoorFormat format = gemmi::coor_format_from_ext(infile.basepath()); gemmi::Structure st; + std::unordered_map entity_to_tax_id; switch (format) { case gemmi::CoorFormat::Pdb: st = gemmi::pdb_impl::read_pdb_from_stream(gemmi::MemoryStream(buffer, bufferSize), name, gemmi::PdbReadOptions()); break; case gemmi::CoorFormat::Mmcif: { gemmi::cif::Document doc = gemmi::cif::read_memory(buffer, bufferSize, name.c_str()); + entity_to_tax_id = getEntityTaxIDMapping(doc); st = gemmi::make_structure(doc); break; } @@ -102,7 +134,7 @@ bool GemmiWrapper::loadFromBuffer(const char * buffer, size_t bufferSize, const case gemmi::CoorFormat::Detect: return false; } - updateStructure((void*) &st, name); + updateStructure((void*) &st, name, entity_to_tax_id); } catch (std::runtime_error& e) { return false; } @@ -190,7 +222,7 @@ bool GemmiWrapper::loadFoldcompStructure(std::istream& stream, const std::string return true; } -void GemmiWrapper::updateStructure(void * void_st, const std::string& filename) { +void GemmiWrapper::updateStructure(void * void_st, const std::string& filename, std::unordered_map& entity_to_tax_id) { gemmi::Structure * st = (gemmi::Structure *) void_st; title.clear(); @@ -203,11 +235,11 @@ void GemmiWrapper::updateStructure(void * void_st, const std::string& filename) cb.clear(); n.clear(); ami.clear(); - title.append( st->get_info("_struct.title")); + taxIds.clear(); + title.append(st->get_info("_struct.title")); size_t currPos = 0; for (gemmi::Model& model : st->models){ for (gemmi::Chain& ch : model.chains) { - size_t chainStartPos = currPos; size_t pos = filename.find_last_of("\\/"); std::string name = (std::string::npos == pos) @@ -216,19 +248,26 @@ void GemmiWrapper::updateStructure(void * void_st, const std::string& filename) //name.push_back('_'); chainNames.push_back(ch.name); names.push_back(name); + int taxId = -1; for (gemmi::Residue &res : ch.residues) { + if (taxId == -1) { + auto it = entity_to_tax_id.find(res.entity_id); + if (it != entity_to_tax_id.end()) { + taxId = it->second; + } + } bool isHetAtomInList = res.het_flag == 'H' && threeAA2oneAA.find(res.name) != threeAA2oneAA.end(); - if(isHetAtomInList == false && res.het_flag != 'A') + if (isHetAtomInList == false && res.het_flag != 'A') continue; - if(isHetAtomInList){ + if (isHetAtomInList) { bool hasCA = false; - for(gemmi::Atom &atom : res.atoms){ - if(atom.name == "CA"){ + for (gemmi::Atom &atom : res.atoms) { + if (atom.name == "CA") { hasCA = true; break; } } - if(hasCA == false){ + if (hasCA == false) { continue; } } @@ -269,6 +308,7 @@ void GemmiWrapper::updateStructure(void * void_st, const std::string& filename) ami.push_back(threeAA2oneAA[res.name]); } } + taxIds.push_back(taxId == -1 ? 0 : taxId); chain.push_back(std::make_pair(chainStartPos, currPos)); } } diff --git a/src/strucclustutils/GemmiWrapper.h b/src/strucclustutils/GemmiWrapper.h index 098f959e..63edb6ce 100644 --- a/src/strucclustutils/GemmiWrapper.h +++ b/src/strucclustutils/GemmiWrapper.h @@ -28,6 +28,7 @@ class GemmiWrapper { std::vector names; std::vector chainNames; std::vector> chain; + std::vector taxIds; std::string title; private: std::unordered_map threeAA2oneAA; @@ -35,7 +36,7 @@ class GemmiWrapper { int chainIt; bool loadFoldcompStructure(std::istream& stream, const std::string& filename); - void updateStructure(void * structure, const std::string & filename); + void updateStructure(void * structure, const std::string & filename, std::unordered_map& entity_to_tax_id); }; diff --git a/src/strucclustutils/structcreatedb.cpp b/src/strucclustutils/structcreatedb.cpp index 6c5c345c..a2fa46ba 100644 --- a/src/strucclustutils/structcreatedb.cpp +++ b/src/strucclustutils/structcreatedb.cpp @@ -14,6 +14,7 @@ #include "microtar.h" #include "PatternCompiler.h" #include "Coordinate16.h" +#include "itoa.h" #include #include @@ -67,16 +68,22 @@ int structure_mtar_gzopen(mtar_t *tar, const char *filename) { } #endif +template +static inline bool compareByFirst(const std::pair& a, const std::pair& b) { + return a.first < b.first; +} + size_t writeStructureEntry(SubstitutionMatrix & mat, GemmiWrapper & readStructure, StructureTo3Di & structureTo3Di, PulchraWrapper & pulchra, std::vector & alphabet3di, std::vector & alphabetAA, std::vector & camol, std::string & header, std::string & name, DBWriter & aadbw, DBWriter & hdbw, DBWriter & torsiondbw, DBWriter & cadbw, int chainNameMode, - float maskBfactorThreshold, size_t & tooShort, size_t &globalCnt, int thread_idx, int coordStoreMode, - std::string & filename, size_t &fileidCnt, + float maskBfactorThreshold, size_t & tooShort, size_t & globalCnt, int thread_idx, int coordStoreMode, + std::string & filename, size_t & fileidCnt, std::map & entrynameToFileId, std::map & filenameToFileId, - std::map & fileIdToName) { + std::map & fileIdToName, + DBWriter* mappingWriter) { size_t id = __sync_fetch_and_add(&globalCnt, readStructure.chain.size()); size_t entriesAdded = 0; for(size_t ch = 0; ch < readStructure.chain.size(); ch++){ @@ -154,6 +161,12 @@ writeStructureEntry(SubstitutionMatrix & mat, GemmiWrapper & readStructure, Stru hdbw.writeData(header.c_str(), header.size(), dbKey, thread_idx); name.clear(); + if (mappingWriter != NULL) { + std::string taxId = SSTR(readStructure.taxIds[ch]); + taxId.append(1, '\n'); + mappingWriter->writeData(taxId.c_str(), taxId.size(), dbKey, thread_idx, false); + } + float* camolf32; if (coordStoreMode == LocalParameters::COORD_STORE_MODE_CA_DIFF) { camol.resize((chainLen - 1) * 3 * sizeof(int16_t) + 3 * sizeof(float) + 1 * sizeof(uint8_t)); @@ -212,8 +225,6 @@ void sortDatafileByIdOrder(DBWriter & dbw, } } - - int createdb(int argc, const char **argv, const Command& command) { LocalParameters& par = LocalParameters::getLocalInstance(); par.parseParameters(argc, argv, command, true, 0, MMseqsParameter::COMMAND_COMMON); @@ -255,6 +266,12 @@ int createdb(int argc, const char **argv, const Command& command) { cadbw.open(); DBWriter aadbw((outputName).c_str(), (outputName+".index").c_str(), static_cast(par.threads), par.compressed, Parameters::DBTYPE_AMINO_ACIDS); aadbw.open(); + DBWriter* mappingWriter = NULL; + if (par.writeMapping) { + mappingWriter = new DBWriter((outputName+"_mapping_tmp").c_str(), (outputName+"_mapping_tmp.index").c_str(), static_cast(par.threads), false, LocalParameters::DBTYPE_OMIT_FILE); + mappingWriter->open(); + } + SubstitutionMatrix mat(par.scoringMatrixFile.values.aminoacid().c_str(), 2.0, par.scoreBias); Debug::Progress progress(par.filenames.size()); std::map entrynameToFileId; @@ -316,7 +333,7 @@ int createdb(int argc, const char **argv, const Command& command) { } #endif -#pragma omp parallel default(none) shared(tar, par, torsiondbw, hdbw, cadbw, aadbw, mat, progress, globalCnt, globalFileidCnt, entrynameToFileId, filenameToFileId, fileIdToName) num_threads(localThreads) reduction(+:incorrectFiles, tooShort) +#pragma omp parallel default(none) shared(tar, par, torsiondbw, hdbw, cadbw, aadbw, mat, progress, globalCnt, globalFileidCnt, entrynameToFileId, filenameToFileId, fileIdToName, mappingWriter) num_threads(localThreads) reduction(+:incorrectFiles, tooShort) { unsigned int thread_idx = 0; #ifdef OPENMP @@ -357,6 +374,7 @@ int createdb(int argc, const char **argv, const Command& command) { bool proceed = true; PatternCompiler include(par.tarInclude.c_str()); PatternCompiler exclude(par.tarExclude.c_str()); + while (proceed) { bool writeEntry = true; #pragma omp critical @@ -445,10 +463,13 @@ int createdb(int argc, const char **argv, const Command& command) { incorrectFiles++; continue; } - writeStructureEntry(mat, readStructure, structureTo3Di, pulchra, - alphabet3di, alphabetAA, camol, header, name, aadbw, hdbw, torsiondbw, cadbw, - par.chainNameMode, par.maskBfactorThreshold, tooShort, globalCnt, thread_idx, par.coordStoreMode, - name, globalFileidCnt, entrynameToFileId, filenameToFileId, fileIdToName); + writeStructureEntry( + mat, readStructure, structureTo3Di, pulchra, + alphabet3di, alphabetAA, camol, header, name, aadbw, hdbw, torsiondbw, cadbw, + par.chainNameMode, par.maskBfactorThreshold, tooShort, globalCnt, thread_idx, par.coordStoreMode, + name, globalFileidCnt, entrynameToFileId, filenameToFileId, fileIdToName, + mappingWriter + ); } } // end while free(inflateBuffer); @@ -459,7 +480,7 @@ int createdb(int argc, const char **argv, const Command& command) { //===================== single_process ===================//__110710__// -#pragma omp parallel default(none) shared(par, torsiondbw, hdbw, cadbw, aadbw, mat, looseFiles, progress, globalCnt, globalFileidCnt, entrynameToFileId, filenameToFileId, fileIdToName) reduction(+:incorrectFiles, tooShort) +#pragma omp parallel default(none) shared(par, torsiondbw, hdbw, cadbw, aadbw, mat, looseFiles, progress, globalCnt, globalFileidCnt, entrynameToFileId, filenameToFileId, fileIdToName, mappingWriter) reduction(+:incorrectFiles, tooShort) { unsigned int thread_idx = 0; #ifdef OPENMP @@ -475,7 +496,6 @@ int createdb(int argc, const char **argv, const Command& command) { std::string header; std::string name; - #pragma omp for schedule(static) for (size_t i = 0; i < looseFiles.size(); i++) { progress.updateProgress(); @@ -485,12 +505,14 @@ int createdb(int argc, const char **argv, const Command& command) { continue; } // clear memory - writeStructureEntry(mat, readStructure, structureTo3Di, pulchra, - alphabet3di, alphabetAA, camol, header, name, aadbw, hdbw, torsiondbw, cadbw, - par.chainNameMode, par.maskBfactorThreshold, tooShort, globalCnt, thread_idx, par.coordStoreMode, - looseFiles[i], globalFileidCnt, entrynameToFileId, filenameToFileId, fileIdToName); + writeStructureEntry( + mat, readStructure, structureTo3Di, pulchra, + alphabet3di, alphabetAA, camol, header, name, aadbw, hdbw, torsiondbw, cadbw, + par.chainNameMode, par.maskBfactorThreshold, tooShort, globalCnt, thread_idx, par.coordStoreMode, + looseFiles[i], globalFileidCnt, entrynameToFileId, filenameToFileId, fileIdToName, + mappingWriter + ); } - } #ifdef HAVE_GCS @@ -512,7 +534,7 @@ int createdb(int argc, const char **argv, const Command& command) { filter = parts[2][0]; } progress.reset(SIZE_MAX); -#pragma omp parallel default(none) shared(par, torsiondbw, hdbw, cadbw, aadbw, mat, gcsPaths, progress, globalCnt, globalFileidCnt, entrynameToFileId, filenameToFileId, fileIdToName, client, bucket_name, filter) reduction(+:incorrectFiles, tooShort) +#pragma omp parallel default(none) shared(par, torsiondbw, hdbw, cadbw, aadbw, mat, gcsPaths, progress, globalCnt, globalFileidCnt, entrynameToFileId, filenameToFileId, fileIdToName, client, bucket_name, filter, mappingWriter) reduction(+:incorrectFiles, tooShort) { StructureTo3Di structureTo3Di; PulchraWrapper pulchra; @@ -545,10 +567,13 @@ int createdb(int argc, const char **argv, const Command& command) { if (readStructure.loadFromBuffer(contents.c_str(), contents.size(), obj_name) == false) { incorrectFiles++; } else { - writeStructureEntry(mat, readStructure, structureTo3Di, pulchra, - alphabet3di, alphabetAA, camol, header, name, aadbw, hdbw, torsiondbw, cadbw, - par.chainNameMode, par.maskBfactorThreshold, tooShort, globalCnt, thread_idx, par.coordStoreMode, - obj_name, globalFileidCnt, entrynameToFileId, filenameToFileId, fileIdToName); + writeStructureEntry( + mat, readStructure, structureTo3Di, pulchra, + alphabet3di, alphabetAA, camol, header, name, aadbw, hdbw, torsiondbw, cadbw, + par.chainNameMode, par.maskBfactorThreshold, tooShort, globalCnt, thread_idx, par.coordStoreMode, + obj_name, globalFileidCnt, entrynameToFileId, filenameToFileId, fileIdToName, + mappingWriter + ); } } } @@ -562,7 +587,7 @@ int createdb(int argc, const char **argv, const Command& command) { DBReader reader(dbs[i].c_str(), (dbs[i]+".index").c_str(), par.threads, DBReader::USE_INDEX|DBReader::USE_DATA|DBReader::USE_LOOKUP); reader.open(DBReader::LINEAR_ACCCESS); progress.reset(reader.getSize()); -#pragma omp parallel default(none) shared(par, torsiondbw, hdbw, cadbw, aadbw, mat, progress, globalCnt, globalFileidCnt, entrynameToFileId, filenameToFileId, fileIdToName, reader) reduction(+:incorrectFiles, tooShort) +#pragma omp parallel default(none) shared(par, torsiondbw, hdbw, cadbw, aadbw, mat, progress, globalCnt, globalFileidCnt, entrynameToFileId, filenameToFileId, fileIdToName, reader, mappingWriter) reduction(+:incorrectFiles, tooShort) { StructureTo3Di structureTo3Di; PulchraWrapper pulchra; @@ -592,10 +617,13 @@ int createdb(int argc, const char **argv, const Command& command) { if (readStructure.loadFromBuffer(data, len, name) == false) { incorrectFiles++; } else { - writeStructureEntry(mat, readStructure, structureTo3Di, pulchra, - alphabet3di, alphabetAA, camol, header, name, aadbw, hdbw, torsiondbw, cadbw, - par.chainNameMode, par.maskBfactorThreshold, tooShort, globalCnt, thread_idx, par.coordStoreMode, - dbname, globalFileidCnt, entrynameToFileId, filenameToFileId, fileIdToName); + writeStructureEntry( + mat, readStructure, structureTo3Di, pulchra, + alphabet3di, alphabetAA, camol, header, name, aadbw, hdbw, torsiondbw, cadbw, + par.chainNameMode, par.maskBfactorThreshold, tooShort, globalCnt, thread_idx, par.coordStoreMode, + dbname, globalFileidCnt, entrynameToFileId, filenameToFileId, fileIdToName, + mappingWriter + ); } } } @@ -606,9 +634,12 @@ int createdb(int argc, const char **argv, const Command& command) { hdbw.close(true); cadbw.close(true); aadbw.close(true); + if (par.writeMapping) { + mappingWriter->close(true); + delete mappingWriter; + } - - if(needsReorderingAtTheEnd) { + if (needsReorderingAtTheEnd) { Debug(Debug::INFO) << "Reordering by identifier\n"; DBReader header_reorder((outputName+"_h").c_str(), (outputName+"_h.index").c_str(), par.threads, DBReader::USE_INDEX|DBReader::USE_DATA); header_reorder.open(DBReader::NOSORT); @@ -644,7 +675,10 @@ int createdb(int argc, const char **argv, const Command& command) { } buffer.clear(); } - fclose(file); + if (fclose(file) != 0) { + Debug(Debug::ERROR) << "Cannot close lookup file " << lookupFile << "\n"; + EXIT(EXIT_FAILURE); + } DBWriter hdbw_reorder((outputName+"_h").c_str(), (outputName+"_h.index").c_str(), static_cast(par.threads), par.compressed, Parameters::DBTYPE_GENERIC_DB); hdbw_reorder.open(); @@ -678,11 +712,57 @@ int createdb(int argc, const char **argv, const Command& command) { sortDatafileByIdOrder(aadbw_reorder, aadbr_reorder, mappingOrder); aadbw_reorder.close(true); aadbr_reorder.close(); - }else{ + + if (par.writeMapping) { + DBReader mappingReader_reorder((outputName+"_mapping_tmp").c_str(), (outputName+"_mapping_tmp.index").c_str(), par.threads, DBReader::USE_INDEX|DBReader::USE_DATA); + mappingReader_reorder.open(DBReader::NOSORT); + mappingReader_reorder.readMmapedDataInMemory(); + DBWriter mappingWriter_reorder((outputName+"_mapping_tmp").c_str(), (outputName+"_mapping_tmp.index").c_str(), static_cast(par.threads), par.compressed, LocalParameters::DBTYPE_CA_ALPHA); + mappingWriter_reorder.open(); + sortDatafileByIdOrder(mappingWriter_reorder, mappingReader_reorder, mappingOrder); + mappingWriter_reorder.close(true); + mappingReader_reorder.close(); + } + } else { DBWriter::createRenumberedDB((outputName+"_ss").c_str(), (outputName+"_ss.index").c_str(), "", "", DBReader::LINEAR_ACCCESS); DBWriter::createRenumberedDB((outputName+"_h").c_str(), (outputName+"_h.index").c_str(), "", "", DBReader::LINEAR_ACCCESS); DBWriter::createRenumberedDB((outputName+"_ca").c_str(), (outputName+"_ca.index").c_str(), "", "", DBReader::LINEAR_ACCCESS); DBWriter::createRenumberedDB((outputName).c_str(), (outputName+".index").c_str(), "", "", DBReader::LINEAR_ACCCESS); + if (par.writeMapping) { + DBWriter::createRenumberedDB((outputName+"_mapping_tmp").c_str(), (outputName+"_mapping_tmp.index").c_str(), "", "", DBReader::LINEAR_ACCCESS); + } + } + + if (par.writeMapping) { + std::string mappingFile = outputName + "_mapping"; + FILE* file = FileUtil::openAndDelete(mappingFile.c_str(), "w"); + char buffer[1024]; + DBReader mappingReader((outputName+"_mapping_tmp").c_str(), (outputName+"_mapping_tmp.index").c_str(), 1, DBReader::USE_INDEX|DBReader::USE_DATA); + mappingReader.open(DBReader::NOSORT); + for (size_t i = 0; i < mappingReader.getSize(); ++i) { + unsigned int key = mappingReader.getDbKey(i); + char* data = mappingReader.getData(i, 0); + size_t entryLength = mappingReader.getEntryLen(i) - 1; + char* basePos = buffer; + char* tmpBuff = Itoa::u32toa_sse2(static_cast(key), buffer); + *(tmpBuff-1) = '\t'; + strncpy(tmpBuff, data, entryLength); + tmpBuff = tmpBuff + entryLength; + *(tmpBuff) = '\n'; + size_t length = tmpBuff - basePos + 1; + size_t written = fwrite(buffer, sizeof(char), length, file); + Debug(Debug::ERROR) << "Writing " << std::string(buffer, length) << "\n"; + if (written != length) { + Debug(Debug::ERROR) << "Error writing to file " << mappingFile << "\n"; + EXIT(EXIT_FAILURE); + } + } + mappingReader.close(); + if (fclose(file) != 0) { + Debug(Debug::ERROR) << "Cannot close file " << mappingFile << "\n"; + EXIT(EXIT_FAILURE); + } + DBReader::removeDb(outputName + "_mapping_tmp"); } if (par.writeLookup == true) { @@ -690,8 +770,6 @@ int createdb(int argc, const char **argv, const Command& command) { 1, DBReader::USE_DATA | DBReader::USE_INDEX); readerHeader.open(DBReader::NOSORT); // create lookup file - std::string sourceFilename = outputName + ".source"; - FILE *sourceFile = FileUtil::openAndDelete(sourceFilename.c_str(), "w"); std::string lookupFile = outputName + ".lookup"; FILE *file = FileUtil::openAndDelete(lookupFile.c_str(), "w"); std::string buffer; @@ -713,7 +791,14 @@ int createdb(int argc, const char **argv, const Command& command) { } buffer.clear(); } + if (fclose(file) != 0) { + Debug(Debug::ERROR) << "Cannot close lookup file " << lookupFile << "\n"; + EXIT(EXIT_FAILURE); + } readerHeader.close(); + + std::string sourceFilename = outputName + ".source"; + FILE *sourceFile = FileUtil::openAndDelete(sourceFilename.c_str(), "w"); for(unsigned int fileId = 0; fileId <= maxFileId; fileId++) { std::string filename = fileIdToName[fileId]; buffer.append(SSTR(fileId)); @@ -727,6 +812,10 @@ int createdb(int argc, const char **argv, const Command& command) { } buffer.clear(); } + if (fclose(sourceFile) != 0) { + Debug(Debug::ERROR) << "Cannot close source file " << sourceFilename << "\n"; + EXIT(EXIT_FAILURE); + } }