Skip to content

Commit

Permalink
cosine similarity now calculated
Browse files Browse the repository at this point in the history
  • Loading branch information
hrshdhgd committed Jul 15, 2023
1 parent d97b3b7 commit 667276b
Show file tree
Hide file tree
Showing 3 changed files with 15 additions and 0 deletions.
10 changes: 10 additions & 0 deletions src/oaklib/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,7 @@
from oaklib.datamodels.text_annotator import TextAnnotationConfiguration
from oaklib.datamodels.validation_datamodel import ValidationConfiguration
from oaklib.datamodels.vocabulary import (
DEFAULT_SIMILARITY_MAP_FILE_BY_SEMSIMIAN,
DEVELOPS_FROM,
EQUIVALENT_CLASS,
HAS_OBO_NAMESPACE,
Expand Down Expand Up @@ -2649,6 +2650,11 @@ def similarity_pair(terms, predicates, autolabel: bool, output: TextIO, output_t
type=float,
help="Minimum value for information content",
)
@click.option(
"--embeddings-file",
type=click.File(mode="r"),
help="file containing embeddings of all necessary nodes.",
)
@click.option("-o", "--output", help="path to output")
@click.option(
"--main-score-field",
Expand All @@ -2674,6 +2680,7 @@ def similarity(
low_memory: bool,
min_jaccard_similarity: Optional[float],
min_ancestor_information_content: Optional[float],
embeddings_file: TextIO,
main_score_field,
output_type,
output,
Expand Down Expand Up @@ -2773,10 +2780,13 @@ def similarity(
predicates=actual_predicates,
min_jaccard_similarity=min_jaccard_similarity,
min_ancestor_information_content=min_ancestor_information_content,
embeddings_file=embeddings_file,
outfile=output,
)

# Read the output file line by line and store the contents in a list
if output is None:
output = DEFAULT_SIMILARITY_MAP_FILE_BY_SEMSIMIAN
with open(output, "r") as f:
lines = f.readlines()

Expand Down
2 changes: 2 additions & 0 deletions src/oaklib/datamodels/vocabulary.py
Original file line number Diff line number Diff line change
Expand Up @@ -161,6 +161,8 @@
OWL_VERSION_INFO = "owl:versionInfo"
OWL_VERSION_IRI = "owl:versionIRI"

DEFAULT_SIMILARITY_MAP_FILE_BY_SEMSIMIAN = "similarity_map.tsv"


class SEMAPV(Enum):
"""SEMAPV Enum containing different mapping_justification."""
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -173,6 +173,7 @@ def all_by_all_pairwise_similarity_quick(
predicates: List[PRED_CURIE] = None,
min_jaccard_similarity: Optional[float] = None,
min_ancestor_information_content: Optional[float] = None,
embeddings_file: str = None,
outfile: str = None,
) -> None:
"""
Expand All @@ -185,11 +186,13 @@ def all_by_all_pairwise_similarity_quick(
"""
objects = list(objects)
logging.info(f"Calculating all-by-all pairwise similarity for {len(objects)} objects")

self.semsimian.all_by_all_pairwise_similarity_quick(
subject_terms=set(subjects),
object_terms=set(objects),
minimum_jaccard_threshold=min_jaccard_similarity,
minimum_resnik_threshold=min_ancestor_information_content,
predicates=set(predicates) if predicates else None,
embeddings_file=embeddings_file.name,
outfile=outfile,
)

0 comments on commit 667276b

Please sign in to comment.