diff --git a/src/oaklib/cli.py b/src/oaklib/cli.py index cace6e1cb..832d1db40 100644 --- a/src/oaklib/cli.py +++ b/src/oaklib/cli.py @@ -70,6 +70,7 @@ from oaklib.datamodels.text_annotator import TextAnnotationConfiguration from oaklib.datamodels.validation_datamodel import ValidationConfiguration from oaklib.datamodels.vocabulary import ( + DEFAULT_SIMILARITY_MAP_FILE_BY_SEMSIMIAN, DEVELOPS_FROM, EQUIVALENT_CLASS, HAS_OBO_NAMESPACE, @@ -2649,6 +2650,11 @@ def similarity_pair(terms, predicates, autolabel: bool, output: TextIO, output_t type=float, help="Minimum value for information content", ) +@click.option( + "--embeddings-file", + type=click.File(mode="r"), + help="file containing embeddings of all necessary nodes.", +) @click.option("-o", "--output", help="path to output") @click.option( "--main-score-field", @@ -2674,6 +2680,7 @@ def similarity( low_memory: bool, min_jaccard_similarity: Optional[float], min_ancestor_information_content: Optional[float], + embeddings_file: TextIO, main_score_field, output_type, output, @@ -2773,10 +2780,13 @@ def similarity( predicates=actual_predicates, min_jaccard_similarity=min_jaccard_similarity, min_ancestor_information_content=min_ancestor_information_content, + embeddings_file=embeddings_file, outfile=output, ) # Read the output file line by line and store the contents in a list + if output is None: + output = DEFAULT_SIMILARITY_MAP_FILE_BY_SEMSIMIAN with open(output, "r") as f: lines = f.readlines() diff --git a/src/oaklib/datamodels/vocabulary.py b/src/oaklib/datamodels/vocabulary.py index e21148a0c..4bdae7f6d 100644 --- a/src/oaklib/datamodels/vocabulary.py +++ b/src/oaklib/datamodels/vocabulary.py @@ -161,6 +161,8 @@ OWL_VERSION_INFO = "owl:versionInfo" OWL_VERSION_IRI = "owl:versionIRI" +DEFAULT_SIMILARITY_MAP_FILE_BY_SEMSIMIAN = "similarity_map.tsv" + class SEMAPV(Enum): """SEMAPV Enum containing different mapping_justification.""" diff --git a/src/oaklib/implementations/semsimian/semsimian_implementation.py b/src/oaklib/implementations/semsimian/semsimian_implementation.py index f9ab0cfe7..965f8ff11 100644 --- a/src/oaklib/implementations/semsimian/semsimian_implementation.py +++ b/src/oaklib/implementations/semsimian/semsimian_implementation.py @@ -173,6 +173,7 @@ def all_by_all_pairwise_similarity_quick( predicates: List[PRED_CURIE] = None, min_jaccard_similarity: Optional[float] = None, min_ancestor_information_content: Optional[float] = None, + embeddings_file: str = None, outfile: str = None, ) -> None: """ @@ -185,11 +186,13 @@ def all_by_all_pairwise_similarity_quick( """ objects = list(objects) logging.info(f"Calculating all-by-all pairwise similarity for {len(objects)} objects") + self.semsimian.all_by_all_pairwise_similarity_quick( subject_terms=set(subjects), object_terms=set(objects), minimum_jaccard_threshold=min_jaccard_similarity, minimum_resnik_threshold=min_ancestor_information_content, predicates=set(predicates) if predicates else None, + embeddings_file=embeddings_file.name, outfile=outfile, )