From 5737260cae1200e862f31fd3e77d189ff19f5837 Mon Sep 17 00:00:00 2001
From: Sean Perry
Date: Wed, 24 Aug 2022 20:37:28 -0700
Subject: [PATCH] Implemented clustering to user agreement work (#298)

---
 audino/backend/Dockerfile                     |   1 +
 audino/backend/requirements.txt               |   1 +
 audino/backend/routes/__init__.py             |   2 +-
 audino/backend/routes/data_sci_processing.py  | 208 ++++
 audino/backend/routes/piha.py                 | 910 ------------------
 .../backend/routes/project_segmentations.py   |   6 +-
 6 files changed, 216 insertions(+), 912 deletions(-)
 create mode 100644 audino/backend/routes/data_sci_processing.py
 delete mode 100644 audino/backend/routes/piha.py

diff --git a/audino/backend/Dockerfile b/audino/backend/Dockerfile
index e9226fb7..8e9167d8 100644
--- a/audino/backend/Dockerfile
+++ b/audino/backend/Dockerfile
@@ -8,6 +8,7 @@ RUN apk add build-base linux-headers pcre-dev

 COPY ./requirements.txt /app/backend

+RUN pip3 install --upgrade pip
 RUN pip3 install --upgrade setuptools
 RUN pip3 install -r requirements.txt

diff --git a/audino/backend/requirements.txt b/audino/backend/requirements.txt
index 00e51f3d..1d4ca675 100644
--- a/audino/backend/requirements.txt
+++ b/audino/backend/requirements.txt
@@ -22,3 +22,4 @@ flask-redis==0.4.0
 mutagen==1.45.1
 pandas==1.3.4
 scipy==1.7.2
+scikit-learn==1.0.2
diff --git a/audino/backend/routes/__init__.py b/audino/backend/routes/__init__.py
index 3ceff11f..c05ba5a9 100644
--- a/audino/backend/routes/__init__.py
+++ b/audino/backend/routes/__init__.py
@@ -72,4 +72,4 @@
     getNextClip,
     get_next_data_unknown
 )
-from .piha import (update_confidence_api)
+from .data_sci_processing import (update_confidence_api)
diff --git a/audino/backend/routes/data_sci_processing.py b/audino/backend/routes/data_sci_processing.py
new file mode 100644
index 00000000..e54183da
--- /dev/null
+++ b/audino/backend/routes/data_sci_processing.py
@@ -0,0 +1,208 @@
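+# Confidence scoring for annotator agreement.
+# Replaces the IoU-based metrics from piha.py: the annotations on a recording
+# are grouped per label ("MANUAL ID"), clustered with DBSCAN over their start
+# and end times, and a rescaled silhouette score of that clustering is stored
+# on the Data row as its confidence value.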
+import sqlalchemy as sa
+from sqlalchemy import or_, and_, not_
+from sqlalchemy.sql.expression import true, false
+from flask import jsonify, request
+from flask_jwt_extended import jwt_required, get_jwt_identity
+from sqlalchemy.orm.attributes import flag_modified
+from backend import app, db
+from backend.models import Project, User, Label, Data, Segmentation
+from backend.models import LabelType
+from . import api
+from .helper_functions import (
+    check_admin,
+    check_admin_permissions,
+    general_error,
+    missing_data
+)
+import numpy as np
+import pandas as pd
+from sklearn.cluster import DBSCAN
+import math
+from sklearn import metrics
+
+
+@api.route("/update_confidence/<int:project_id>/<int:data_id>", methods=["PUT"])
+@jwt_required
+def update_confidence_api(project_id, data_id):
+    identity = get_jwt_identity()
+    username = identity["username"]
+    return update_confidence(project_id, data_id, username)
+
+
+def update_confidence(project_id, data_id, username):
+    app.logger.info("Updating confidence level")
+    project = Project.query.get(project_id)
+    if not project.is_iou:
+        return jsonify(message="iou metrics not used"), 202
+    data_pt = Data.query.get(data_id)
+
+    data_pt.set_previous_users(username)
+    flag_modified(data_pt, "users_reviewed")
+    db.session.add(data_pt)
+    db.session.commit()
+    db.session.refresh(data_pt)
+
+    scores = np.array([])
+    segmentations = Segmentation.query.filter_by(data_id=data_id).distinct()
+    df = make_dataframe(segmentations)
+
+    # Score agreement separately for each annotated class, then average.
+    for manual_id in df["MANUAL ID"].unique():
+        tmp_df = df[df["MANUAL ID"] == manual_id]
+        model, clusters, data_processed, silhouette = run_clustering(
+            DBSCAN_auto_dis_builder_min_dis2, tmp_df,
+            np.unique(tmp_df["LAST MOD BY"])
+        )
+
+        def label_cluster(row):
+            segment = Segmentation.query.get(row["ID"])
+            segment.set_counted(row["cluster"])
+            db.session.add(segment)
+            return row
+
+        data_processed.apply(label_cluster, axis=1)
+        scores = np.append(scores, silhouette[0])
+
+    # Guard against recordings that end up with no annotations at all.
+    confidence = float(scores.mean()) if scores.size > 0 else 0.0
+    data_pt.set_confidence(confidence)
+    app.logger.info(confidence)
+    app.logger.info(scores)
+    flag_modified(data_pt, "users_reviewed")
+    flag_modified(data_pt, "confidence")
+    db.session.add(data_pt)
+    db.session.commit()
+    db.session.refresh(data_pt)
+
+    app.logger.info("CHANGED CONFIDENCE LEVEL")
+    app.logger.info(data_pt.confidence)
+    return 200
+
+
+def make_dataframe(segmentations):
+    """Flatten segmentations into the annotation table used for clustering."""
+    OFFSET = []
+    END = []
+    MANUAL_ID = []
+    ANNOTATION_ID = []
+    LAST_MOD = []
+
+    for segment in segmentations:
+        start = segment.start_time
+        end = segment.end_time
+        if (len(segment.values) == 0):
+            ANNOTATION_ID.append(segment.id)
+            OFFSET.append(start)
+            END.append(end)
+            MANUAL_ID.append("No class of interest")
+            LAST_MOD.append(segment.created_by)
+
+        for labelCate in segment.values:
+            # for values in labelCate["values"]:
+            # TODO: handle edge cases of multiple values per label
+            ANNOTATION_ID.append(segment.id)
+            manual_id = labelCate.value
+            OFFSET.append(start)
+            END.append(end)
+            MANUAL_ID.append(manual_id)
+            LAST_MOD.append(segment.created_by)
+    df = {
+        "ID": tuple(ANNOTATION_ID),
+        "OFFSET": tuple(OFFSET),
+        "END TIMES": tuple(END),
+        "MANUAL ID": tuple(MANUAL_ID),
+        "LAST MOD BY": tuple(LAST_MOD)
+    }
+    return pd.DataFrame.from_dict(df)
+
+
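+# How the agreement score is computed:
+#   * run_clustering fits DBSCAN on the (OFFSET, END TIMES) pairs of every
+#     annotation for one label and reports the silhouette score rescaled to
+#     0-1, plus a variant blended with the average number of distinct users
+#     per cluster.
+#   * DBSCAN_auto_dis_builder_min_dis2 derives the DBSCAN eps from the
+#     smallest gap between consecutive annotations made by any single user,
+#     so overlapping annotations from different users land in one cluster.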
+def run_clustering(model_builder, data_oi, users, distance=1/2, agreement=1,
+                   duration=True, figure=1, verbose=False):
+    neighborhood_size, model = model_builder(
+        data=data_oi, distance=distance, users=users, agreement=agreement)
+    if verbose:
+        print("neighborhood size: ", neighborhood_size)
+    clusters = model.fit_predict(data_oi[["OFFSET", "END TIMES"]])
+    data_oi["cluster"] = clusters
+
+    # Average cluster size and average number of distinct annotators per
+    # cluster (DBSCAN labels clusters 0..max; -1 marks noise points).
+    adv_cluster_count = 0
+    adv_num_unique_users = 0
+    n_clusters = int(max(clusters)) + 1
+    for i in range(n_clusters):
+        temp = data_oi[data_oi["cluster"] == i]
+        adv_cluster_count += len(temp)
+        adv_num_unique_users += len(pd.unique(temp['LAST MOD BY']))
+    if n_clusters > 0:
+        adv_cluster_count /= n_clusters
+        adv_num_unique_users /= n_clusters
+
+    if (verbose):
+        print(clusters)
+        print("average cluster size: ", adv_cluster_count)
+        print("average unique users per cluster: ", adv_num_unique_users)
+
+    silhouette = 0
+    silhouette_users = 0
+    try:
+        silhouette = metrics.silhouette_score(
+            data_oi[["OFFSET", "END TIMES"]], clusters)
+        # silhouette_score is in [-1, 1]; rescale to [0, 1].
+        silhouette = (silhouette + 1) / 2
+        silhouette_users = (silhouette + adv_num_unique_users / len(users)) / 2
+
+        if (verbose):
+            print("========================================")
+            print("Silhouette score (scaled 0 - 1): ", silhouette)
+            print("Silhouette blended with user agreement: ", silhouette_users)
+
+    except Exception:
+        if (verbose):
+            print("ERROR: not enough clusters to compute metrics")
+
+    return model, clusters, data_oi, (silhouette, silhouette_users)
+
+
+def DBSCAN_auto_dis_builder_min_dis2(data=None, distance=1, users=None,
+                                     agreement=0.5, duration=False):
+    # Neighborhood size: the smallest distance between consecutive annotations
+    # of any single user, shrunk slightly so that one user's back-to-back
+    # annotations do not merge into a single cluster.
+    dists_raw = []
+    for i in range(len(users)):
+        user_labels = data[data['LAST MOD BY'] == users[i]]
+        s1 = 0
+        e1 = 0
+        skip = True
+        for index, row in user_labels.iterrows():
+            s2 = float(row["OFFSET"])
+            e2 = float(row["END TIMES"])
+            dist = distance_cal2(s1, e1, s2, e2)
+            if (not skip):
+                dists_raw.append(dist)
+
+            skip = False
+            s1 = s2
+            e1 = e2
+
+    if len(dists_raw) == 0:
+        dists_raw.append(1)  # TODO: investigate edge case (single annotation)
+    adv_distance = min(dists_raw)
+    return adv_distance, DBSCAN(
+        eps=adv_distance * 0.9,
+        min_samples=2,
+    )
+
+
+def distance_cal2(s1, e1, s2, e2):
+    return math.sqrt((s2 - s1) * (s2 - s1) + (e2 - e1) * (e2 - e1))
+
+
+def distance_cal3(s1, e1, s2, e2, d1, d2):
+    return math.sqrt((s2 - s1) * (s2 - s1) + (e2 - e1) * (e2 - e1)
+                     + (d2 - d1) * (d2 - d1))
diff --git a/audino/backend/routes/piha.py b/audino/backend/routes/piha.py
deleted file mode 100644
index 69271a0f..00000000
--- a/audino/backend/routes/piha.py
+++ /dev/null
@@ -1,910 +0,0 @@
-import sqlalchemy as sa
-from sqlalchemy import or_, and_, not_
-from sqlalchemy.sql.expression import true, false
-from flask import jsonify, request
-from flask_jwt_extended import jwt_required, get_jwt_identity
-from sqlalchemy.orm.attributes import flag_modified
-from backend import app, db
-from backend.models import Project, User, Label, Data, Segmentation
-from backend.models import LabelType
-from . import api
-from .helper_functions import (
-    check_admin,
-    check_admin_permissions,
-    general_error,
-    missing_data
-)
-import numpy as np
-##
-## Calculates IOU scores for review and quality control
-## Code mostly taken from https://github.com/UCSD-E4E/PyHa
-## Pyha is a great tool for passive acoustic monitoring with features that
-## Allows reserachers to study and identify aduio events in their data
-## Give it a look!
-## - - - - - -@api.route("/update_confidence//", methods=["PUT"]) -@jwt_required -def update_confidence_api(project_id, data_id): - identity = get_jwt_identity() - username = identity["username"] - #return jsonify(message="not being used"), 202 - return update_confidence(project_id, data_id, username) - -def update_confidence(project_id, data_id, username): - project = Project.query.get(project_id) - THRESHOLD = project.threshold - if not project.is_iou: - return jsonify(message="iou meterics not used"), 202 - data_pt = Data.query.get(data_id) - request_user = User.query.filter_by(username=username - ).first() - # TODO - # Do a literature review - # COMPARE FROM AUTHOR USER ONLY - # Pairwise comparision **** Look into pairwise statistiics - # Do I take adverage or median? - - data_pt.set_previous_users(username) - flag_modified(data_pt, "users_reviewed") - db.session.add(data_pt) - db.session.commit() - db.session.refresh(data_pt) - - app.logger.info(data_pt.users_reviewed) - confidence = data_pt.confidence - confidence_adv = 0 - num_reviewers = len(data_pt.users_reviewed) # + 1 #users_reviewed not updated yet - total = 0 #NOTE CHANGE FOR OTHERS< THIS IS ONLY BECAUSE OF THE EARLIER ERROR - score = [] - columns = [] - if(num_reviewers > 0):#len(data.users_reviewed) > 0): - for user_prime in data_pt.users_reviewed: - columns.append(user_prime) - mini_scores = [] - #if (user_prime in username): - # app.logger.info([1, user_prime, user_prime]) - # mini_scores, confidence_adv = add_confidence(mini_scores, confidence_adv, 1) - # continue - segmentations_new = Segmentation.query.filter_by(data_id=data_id, created_by=user_prime).distinct() - for user in data_pt.users_reviewed: - #if (user in username): - # app.logger.info([1, user_prime, user]) - # mini_scores, confidence_adv = add_confidence(mini_scores, confidence_adv, 1) - # continue - segmentations_old = Segmentation.query.filter_by(data_id=data_id, created_by=user).distinct() - - old_df = make_dataframe(data_id, segmentations_old) - new_df = make_dataframe(data_id, segmentations_new) - app.logger.info(old_df) - app.logger.info(new_df) - if not (old_df.empty or new_df.empty): - - overlap = clip_statistics(new_df, old_df, stats_type="general")#, # - #app.logger.info(overlap) - if (len(overlap) == 0): - #pass - confidence = 0 - else: - #app.logger.info(overlap.iloc[0]) - confidence = float(overlap.iloc[0]['Global IoU']) - - app.logger.info([confidence, user_prime, user]) - mini_scores, confidence_adv = add_confidence(mini_scores, confidence_adv, confidence) - - #app.logger.info(confidence_adv) - - count = 0 - if (confidence > THRESHOLD): - count = 1 - - for segment in segmentations_new: - segment.set_counted(count) - db.session.add(segment) - - for segment in segmentations_new: - segment.set_counted(count) - db.session.add(segment) - total += 1 - score.append(mini_scores) - db.session.commit() - - #for segment in segmentations_old: - # segment.set_counted(2) - # db.session.add(segment) - - if (total == 0): - total = 1 - confidence = confidence_adv/(total) - - - - app.logger.info(columns) - scores_df = pd.DataFrame(score, columns=columns) - app.logger.info(scores_df) - app.logger.info(scores_df >= THRESHOLD) - - data_pt.set_confidence(confidence) - - #DISPLAY CSV FORM OF DATA - data_pt.set_iou_matrix(scores_df.to_csv()) - flag_modified(data_pt, "users_reviewed") - flag_modified(data_pt, "confidence") - db.session.add(data_pt) - db.session.commit() - ##app.logger.info(data_pt.confidence) - db.session.refresh(data_pt) - 
##app.logger.info(confidence) - app.logger.info("CHANGED CONFIDENCE LEVEL") - app.logger.info(data_pt.confidence) - ##app.logger.info(data_pt.users_reviewed) - - ##app.logger.info("sent!") - return 200 - - -def add_confidence(mini_scores, confidence_adv, confidence): - - mini_scores.append(confidence) - confidence_adv += confidence - return mini_scores, confidence_adv - - -def make_dataframe(data_id, segmentations): - data_pt = Data.query.get(data_id) - - sample_rate = data_pt.sampling_rate - clip_length = data_pt.clip_length - filename = data_pt.original_filename - folder = "./test/" - - FOLDER = [] - FILE = [] - CHANNEL = [] - CLIP_LENGTH = [] - OFFSET = [] - DURATION = [] - MANUAL_ID = [] - SAMPLE_RATE = [] - data = {'col_1': [3, 2, 1, 0], 'col_2': ['a', 'b', 'c', 'd']} - - - for segment in segmentations: - start = segment.start_time - duration = segment.end_time - start - if (len(segment.values) == 0): - FOLDER.append(folder) - FILE.append(filename) - CHANNEL.append(0) - CLIP_LENGTH.append(clip_length) - OFFSET.append(start) - SAMPLE_RATE.append(sample_rate) - DURATION.append(duration) - MANUAL_ID.append("No class of interest") - - for labelCate in segment.values: - #for values in labelCate["values"]: - #TODO HANDLE EDGE CASES OF MUTLIPLE VALUES OF LABELS - manual_id = labelCate.value - FOLDER.append(folder) - FILE.append(filename) - CHANNEL.append(0) - CLIP_LENGTH.append(clip_length) - OFFSET.append(start) - SAMPLE_RATE.append(sample_rate) - DURATION.append(duration) - MANUAL_ID.append(manual_id) - df = {"FOLDER":tuple(FOLDER), "IN FILE": tuple(FILE), "CHANNEL": tuple(CHANNEL), "CLIP LENGTH": tuple(CLIP_LENGTH), "SAMPLE RATE": tuple(SAMPLE_RATE), "OFFSET": tuple(OFFSET), "DURATION": tuple(DURATION), "MANUAL ID": tuple(MANUAL_ID)} - return pd.DataFrame.from_dict(df) - - - - - - - - - - -## -## Start of IOU Scores -## - -import pandas as pd -from scipy import stats -import numpy as np - - -# Function that takes in a pandas dataframe of annotations and outputs a -# dataframe of the mean, median, mode, quartiles, and standard deviation of -# the annotation durations. -def annotation_duration_statistics(df): - """ - Function that calculates basic statistics related to the duration of - annotations of a Pandas Dataframe compatible with PyHa. - Args: - df (Pandas Dataframe) - - Automated labels or manual labels. - Returns: - Pandas Dataframe containing count, mean, mode, standard deviation, and - IQR values based on annotation duration. - """ - # Reading in the Duration column of the passed in dataframe as a Python - # list - annotation_lengths = df["DURATION"].to_list() - # converting to numpy array which has more readily available statistics - # functions - annotation_lengths = np.asarray(annotation_lengths) - # Converting the Python list to a numpy array - entry = {'COUNT': np.shape(annotation_lengths)[0], - 'MODE': stats.mode(np.round(annotation_lengths, 2))[0][0], - 'MEAN': np.mean(annotation_lengths), - 'STANDARD DEVIATION': np.std(annotation_lengths), - 'MIN': np.amin(annotation_lengths), - 'Q1': np.percentile(annotation_lengths, 25), - 'MEDIAN': np.median(annotation_lengths), - 'Q3': np.percentile(annotation_lengths, 75), - 'MAX': np.amax(annotation_lengths)} - # returning the dictionary as a pandas dataframe - return pd.DataFrame.from_dict([entry]) - - -def clip_general(automated_df, human_df): - """ - Function to generate a dataframe with statistics relating to the efficiency - of the automated label compared to the human label. 
- These statistics include true positive, false positive, false negative, - true negative, union, precision, recall, F1, and Global IoU. For general - clip overlap - Args: - automated_df (Dataframe) - - Dataframe of automated labels for one clip - human_df (Dataframe) - - Dataframe of human labels for one clip. - Returns: - Dataframe with general clip overlap statistics comparing the automated - and human labeling. - """ - # This looks at one class across one clip - clip_class = human_df["MANUAL ID"] - clip_class = list(dict.fromkeys(clip_class))[0] - duration = automated_df["CLIP LENGTH"].to_list()[0] - SAMPLE_RATE = automated_df["SAMPLE RATE"].to_list()[0] - # Initializing two arrays that will represent the human labels and - # automated labels with respect to the audio clip - # print(SIGNAL.shape) - human_arr = np.zeros((int(SAMPLE_RATE * duration),)) - bot_arr = np.zeros((int(SAMPLE_RATE * duration),)) - - folder_name = automated_df["FOLDER"].to_list()[0] - clip_name = automated_df["IN FILE"].to_list()[0] - # Placing 1s wherever the au - for row in automated_df.index: - minval = int(round(automated_df["OFFSET"][row] * SAMPLE_RATE, 0)) - maxval = int( - round( - (automated_df["OFFSET"][row] + - automated_df["DURATION"][row]) * - SAMPLE_RATE, - 0)) - bot_arr[minval:maxval] = 1 - for row in human_df.index: - minval = int(round(human_df["OFFSET"][row] * SAMPLE_RATE, 0)) - maxval = int( - round( - (human_df["OFFSET"][row] + - human_df["DURATION"][row]) * - SAMPLE_RATE, - 0)) - human_arr[minval:maxval] = 1 - - human_arr_flipped = 1 - human_arr - bot_arr_flipped = 1 - bot_arr - - true_positive_arr = human_arr * bot_arr - false_negative_arr = human_arr * bot_arr_flipped - false_positive_arr = human_arr_flipped * bot_arr - true_negative_arr = human_arr_flipped * bot_arr_flipped - IoU_arr = human_arr + bot_arr - IoU_arr[IoU_arr == 2] = 1 - - true_positive_count = np.count_nonzero( - true_positive_arr == 1) / SAMPLE_RATE - false_negative_count = np.count_nonzero( - false_negative_arr == 1) / SAMPLE_RATE - false_positive_count = np.count_nonzero( - false_positive_arr == 1) / SAMPLE_RATE - true_negative_count = np.count_nonzero( - true_negative_arr == 1) / SAMPLE_RATE - union_count = np.count_nonzero(IoU_arr == 1) / SAMPLE_RATE - - # Calculating useful values related to tp,fn,fp,tn values - - # Precision = TP/(TP+FP) - try: - precision = true_positive_count / \ - (true_positive_count + false_positive_count) - - # Recall = TP/(TP+FN) - recall = true_positive_count / \ - (true_positive_count + false_negative_count) - - # F1 = 2*(Recall*Precision)/(Recall + Precision) - - f1 = 2 * (recall * precision) / (recall + precision) - IoU = true_positive_count / union_count - except BaseException: - print('''Error calculating statistics, likely due - to zero division, setting values to zero''') - f1 = 0 - precision = 0 - recall = 0 - IoU = 0 - - # Creating a Dictionary which will be turned into a Pandas Dataframe - entry = {'FOLDER': folder_name, - 'IN FILE': clip_name, - 'MANUAL ID': clip_class, - 'TRUE POSITIVE': true_positive_count, - 'FALSE POSITIVE': false_positive_count, - 'FALSE NEGATIVE': false_negative_count, - 'TRUE NEGATIVE': true_negative_count, - 'UNION': union_count, - 'PRECISION': precision, - 'RECALL': recall, - "F1": f1, - 'Global IoU': IoU} - - return pd.DataFrame(entry, index=[0]) - - -# Will have to adjust the isolate function so that it adds a sampling rate -# onto the dataframes. 
-def automated_labeling_statistics( - automated_df, - manual_df, - stats_type="IoU", - threshold=0.5): - """ - Function that will allow users to easily pass in two dataframes of manual - labels and automated labels, and a dataframe is returned with statistics - examining the efficiency of the automated labelling system compared to the - human labels for multiple clips. - Calls bird_local_scores on corresponding audio clips to generate the - efficiency statistics for one specific clip which is then all put into one - dataframe of statistics for multiple audio clips. - Args: - automated_df (Dataframe) - - Dataframe of automated labels of multiple clips. - manual_df (Dataframe) - - Dataframe of human labels of multiple clips. - stats_type (String) - - String that determines which type of statistics are of interest - threshold (Float) - - Defines a threshold for certain types of statistics such as - Returns: - Dataframe of statistics comparing automated labels and human labels for - multiple clips. - """ - # Getting a list of clips - clips = automated_df["IN FILE"].to_list() - # Removing duplicates - clips = list(dict.fromkeys(clips)) - # Initializing the returned dataframe - statistics_df = pd.DataFrame() - # Looping through each audio clip - for clip in clips: - clip_automated_df = automated_df[automated_df["IN FILE"] == clip] - clip_manual_df = manual_df[manual_df["IN FILE"] == clip] - try: - if stats_type == "general": - clip_stats_df = clip_general( - clip_automated_df, clip_manual_df) - if statistics_df.empty: - statistics_df = clip_stats_df - else: - statistics_df = statistics_df.append(clip_stats_df) - elif stats_type == "IoU": - IoU_Matrix = clip_IoU(clip_automated_df, clip_manual_df) - clip_stats_df = matrix_IoU_Scores( - IoU_Matrix, clip_manual_df, threshold) - if statistics_df.empty: - statistics_df = clip_stats_df - else: - statistics_df = statistics_df.append(clip_stats_df) - - except BaseException: - app.logger.error("Something went wrong with: " + clip) - continue - statistics_df.reset_index(inplace=True, drop=True) - return statistics_df - - -def global_dataset_statistics(statistics_df, manual_id = "bird"): - """ - Function that takes in a dataframe of efficiency statistics for multiple - clips and outputs their global values. - Args: - statistics_df (Dataframe) - - Dataframe of statistics value for multiple audio clips as - returned by the function automated_labelling_statistics. - manual_id (String) - - String to control the "MANUAL ID" column of the csv file - format that is used in PyHa. Defaulted to "bird" since the - package started out with binary bird classification. - Returns: - Dataframe of global statistics for the multiple audio clips' labelling. 
- """ - tp_sum = statistics_df["TRUE POSITIVE"].sum() - fp_sum = statistics_df["FALSE POSITIVE"].sum() - fn_sum = statistics_df["FALSE NEGATIVE"].sum() - tn_sum = statistics_df["TRUE NEGATIVE"].sum() - union_sum = statistics_df["UNION"].sum() - precision = tp_sum / (tp_sum + fp_sum) - recall = tp_sum / (tp_sum + fn_sum) - f1 = 2 * (precision * recall) / (precision + recall) - IoU = tp_sum / union_sum - entry = {'MANUAL ID': manual_id, - 'PRECISION': round(precision, 6), - 'RECALL': round(recall, 6), - 'F1': round(f1, 6), - 'Global IoU': round(IoU, 6)} - return pd.DataFrame.from_dict([entry]) - -# TODO rework this function to implement some linear algebra, right now the -# nested for loop won't handle larger loads well To make a global matrix, find -# the clip with the most amount of automated labels and set that to the number -# of columns I believe this is currently the largest bottleneck in terms of -# temporal performance. - - -def clip_IoU(automated_df, manual_df): - """ - Function that takes in the manual and automated labels for a clip and - outputs IoU metrics of each human label with respect to each - automated label. - Args: - automated_df (Dataframe) - - Dataframe of automated labels for an audio clip. - manual_df (Dataframe) - - Dataframe of human labels for an audio clip. - Returns: - IoU_Matrix (arr) - - (human label count) x (automated label count) matrix where each - row contains the IoU of each automated annotation with respect to - a human label. - """ - - automated_df.reset_index(inplace=True, drop=True) - manual_df.reset_index(inplace=True, drop=True) - # Determining the number of rows in the output numpy array - manual_row_count = manual_df.shape[0] - # Determining the number of columns in the output numpy array - automated_row_count = automated_df.shape[0] - - # Determining the length of the input clip - duration = automated_df["CLIP LENGTH"].to_list()[0] - # Determining the sample rate of the input clip - SAMPLE_RATE = automated_df["SAMPLE RATE"].to_list()[0] - - # Initializing the output array that will contain the clip-by-clip - # Intersection over Union percentages. 
- IoU_Matrix = np.zeros((manual_row_count, automated_row_count)) - # print(IoU_Matrix.shape) - - # Initializing arrays that will represent each of the human and automated - # labels - bot_arr = np.zeros((int(duration * SAMPLE_RATE))) - human_arr = np.zeros((int(duration * SAMPLE_RATE))) - - # Looping through each human label - for row in manual_df.index: - # print(row) - # Determining the beginning of a human label - minval = int(round(manual_df["OFFSET"][row] * SAMPLE_RATE, 0)) - # Determining the end of a human label - maxval = int( - round( - (manual_df["OFFSET"][row] + - manual_df["DURATION"][row]) * - SAMPLE_RATE, - 0)) - # Placing the label relative to the clip - human_arr[minval:maxval] = 1 - # Looping through each automated label - for column in automated_df.index: - # Determining the beginning of an automated label - minval = int( - round( - automated_df["OFFSET"][column] * - SAMPLE_RATE, - 0)) - # Determining the ending of an automated label - maxval = int( - round( - (automated_df["OFFSET"][column] + - automated_df["DURATION"][column]) * - SAMPLE_RATE, - 0)) - # Placing the label relative to the clip - bot_arr[minval:maxval] = 1 - # Determining the overlap between the human label and the automated - # label - intersection = human_arr * bot_arr - # Determining the union between the human label and the automated - # label - union = human_arr + bot_arr - union[union == 2] = 1 - # Determining how much of the human label and the automated label - # overlap with respect to time - intersection_count = np.count_nonzero( - intersection == 1) / SAMPLE_RATE - # Determining the span of the human label and the automated label - # with respect to time. - union_count = np.count_nonzero(union == 1) / SAMPLE_RATE - # Placing the Intersection over Union Percentage into it's - # respective position in the array. - IoU_Matrix[row, column] = round( - intersection_count / union_count, 4) - # Resetting the automated label to zero - bot_arr[bot_arr == 1] = 0 - # Resetting the human label to zero - human_arr[human_arr == 1] = 0 - - return IoU_Matrix - - -def matrix_IoU_Scores(IoU_Matrix, manual_df, threshold): - """ - Function that takes in the IoU Matrix from the clip_IoU function and ouputs - the number of true positives and false positives, as well as calculating - the precision, recall, and f1 metrics. - Args: - IoU_Matrix (arr) - - (human label count) x (automated label count) matrix where each - row contains the IoU of each automated annotation with respect - to a human label. - manual_df (Dataframe) - - Dataframe of human labels for an audio clip. - threshold (float) - - IoU threshold for determining true positives, false - positives, and false negatives. - Returns: - Dataframe of clip statistics such as True Positive, False Negative, - False Positive, Precision, Recall, and F1 values for an audio clip. - """ - clip_class = manual_df["MANUAL ID"][0] - audio_dir = manual_df["FOLDER"][0] - filename = manual_df["IN FILE"][0] - # TODO make sure that all of these calculations are correct. It is - # confusing to me that the Precision and Recall scores have a positive - # correlation. Determining which automated label has the highest IoU across - # each human label - automated_label_best_fits = np.max(IoU_Matrix, axis=1) - # human_label_count = automated_label_best_fits.shape[0] - # Calculating the number of true positives based off of the passed in - # thresholds. 
- tp_count = automated_label_best_fits[automated_label_best_fits >= - threshold].shape[0] - # Calculating the number of false negatives from the number of human - # labels and true positives - fn_count = automated_label_best_fits[automated_label_best_fits < - threshold].shape[0] - - # Calculating the false positives - max_val_per_column = np.max(IoU_Matrix, axis=0) - fp_count = max_val_per_column[max_val_per_column < threshold].shape[0] - - # Calculating the necessary statistics - try: - recall = round(tp_count / (tp_count + fn_count), 4) - precision = round(tp_count / (tp_count + fp_count), 4) - f1 = round(2 * (recall * precision) / (recall + precision), 4) - except ZeroDivisionError: - print( - "Division by zero setting precision, recall, and f1 to zero on " + - filename) - recall = 0 - precision = 0 - f1 = 0 - - entry = {'FOLDER': audio_dir, - 'IN FILE': filename, - 'MANUAL ID': clip_class, - 'TRUE POSITIVE': tp_count, - 'FALSE NEGATIVE': fn_count, - 'FALSE POSITIVE': fp_count, - 'PRECISION': precision, - 'RECALL': recall, - 'F1': f1} - - return pd.DataFrame.from_dict([entry]) - - -def clip_catch(automated_df, manual_df): - """ - Function that determines whether or not a human label has been found across - all of the automated labels. - Args: - automated_df (Dataframe) - - Dataframe of automated labels for an audio clip. - manual_df (Dataframe) - - Dataframe of human labels for an audio clip. - Returns: - Numpy Array of statistics regarding the amount of overlap between the - manual and automated labels relative to the number of samples. - """ - # resetting the indices to make this function work - automated_df.reset_index(inplace=True, drop=True) - manual_df.reset_index(inplace=True, drop=True) - # figuring out how many automated labels and human labels exist - manual_row_count = manual_df.shape[0] - automated_row_count = automated_df.shape[0] - # finding the length of the clip as well as the sampling frequency. - duration = automated_df["CLIP LENGTH"].to_list()[0] - SAMPLE_RATE = automated_df["SAMPLE RATE"].to_list()[0] - # initializing the output array, as well as the two arrays used to - # calculate catch scores - catch_matrix = np.zeros(manual_row_count) - bot_arr = np.zeros((int(duration * SAMPLE_RATE))) - human_arr = np.zeros((int(duration * SAMPLE_RATE))) - - # Determining the automated labelled regions with respect to samples - # Looping through each human label - for row in automated_df.index: - # converting each label into a "pulse" on an array that represents the - # labels as 0's and 1's on bot array. - minval = int(round(automated_df["OFFSET"][row] * SAMPLE_RATE, 0)) - maxval = int( - round( - (automated_df["OFFSET"][row] + - automated_df["DURATION"][row]) * - SAMPLE_RATE, - 0)) - bot_arr[minval:maxval] = 1 - - # Looping through each human label and computing catch = - # (#intersections)/(#samples in label) - for row in manual_df.index: - # Determining the beginning of a human label - minval = int(round(manual_df["OFFSET"][row] * SAMPLE_RATE, 0)) - # Determining the end of a human label - maxval = int( - round( - (manual_df["OFFSET"][row] + - manual_df["DURATION"][row]) * - SAMPLE_RATE, - 0)) - # Placing the label relative to the clip - human_arr[minval:maxval] = 1 - # Determining the length of a label with respect to samples - samples_in_label = maxval - minval - # Finding where the human label and all of the annotated labels overlap - intersection = human_arr * bot_arr - # Determining how many samples overlap. 
- intersection_count = np.count_nonzero(intersection == 1) - # Intersection/length of label - catch_matrix[row] = round(intersection_count / samples_in_label, 4) - # resetting the human label - human_arr[human_arr == 1] = 0 - - return catch_matrix - - -# def dataset_IoU(automated_df,manual_df): -# """ -# Function that takes in two Pandas dataframes that represent human labels -# and automated labels. -# It then runs the clip_IoU function across each clip and appends the best -# fit IoU score to each labels on the manual dataframe as its output. -# -# Args: -# automated_df (Dataframe) - Dataframe of automated labels for multiple -# audio clips. -# manual_df (Dataframe) - Dataframe of human labels for multiple audio -# clips. -# -# Returns: -# Dataframe of manual labels with the best fit IoU score as a column. -# """ -# # Getting a list of clips -# clips = automated_df["IN FILE"].to_list() -# # Removing duplicates -# clips = list(dict.fromkeys(clips)) -# # Initializing the ouput dataframe -# manual_df_with_IoU = pd.DataFrame() -# for clip in clips: -# print(clip) -# # Isolating a clip from the human and automated dataframes -# clip_automated_df = automated_df[automated_df["IN FILE"] == clip] -# clip_manual_df = manual_df[manual_df["IN FILE"] == clip] -# # Calculating the IoU scores of each human label. -# IoU_Matrix = clip_IoU(clip_automated_df,clip_manual_df) -# # Finding the best automated IoU score with respect to each label -# automated_label_best_fits = np.max(IoU_Matrix,axis=1) -# clip_manual_df["IoU"] = automated_label_best_fits -# # Appending on the best fit IoU score to each human label -# if manual_df_with_IoU.empty == True: -# manual_df_with_IoU = clip_manual_df -# else: -# manual_df_with_IoU = manual_df_with_IoU.append(clip_manual_df) -# # Adjusting the indices. -# manual_df_with_IoU.reset_index(inplace = True, drop = True) -# return manual_df_with_IoU - - -# def class_IoU_Statistics(automated_df,manual_df,threshold = 0.5): -# """ -# Wrapper function that takes matrix_IoU_Scores across multiple clips from a -# class. Allows user to modify the threshold that determines whether or not -# a label is a true positive. - -# Args: -# automated_df (Dataframe) -# - Dataframe of automated labels for multiple -# audio clips. - -# manual_df (Dataframe) -# - Dataframe of human labels for multiple audio clips. - -# threshold (float) -# - IoU threshold for determining true positives, false positives, -# and false negatives. - -# Returns: -# Dataframe of IoU statistics for multiple audio clips. -# """ -# # isolating the names of the clips that have been labelled into an array. -# clips = automated_df["IN FILE"].to_list() -# clips = list(dict.fromkeys(clips)) -# # initializing the output Pandas dataframe -# # Looping through all of the clips -# for clip in clips: -# print(clip) -# clip_automated_df = automated_df[automated_df["IN FILE"] == clip] -# clip_manual_df = manual_df[manual_df["IN FILE"] == clip] -# # Computing the IoU Matrix across a specific clip -# IoU_Matrix = clip_IoU(clip_automated_df,clip_manual_df) -# # Calculating the best fit IoU to each label for the clip -# clip_stats_df = matrix_IoU_Scores(IoU_Matrix,clip_manual_df,threshold) -# # adding onto the output array. 
-# if IoU_Statistics.empty == True: -# IoU_Statistics = clip_stats_df -# else: -# IoU_Statistics = IoU_Statistics.append(clip_stats_df) -# IoU_Statistics.reset_index(inplace = True, drop = True) -# return IoU_Statistics - -# Consider adding in a new manual_id parameter here -def global_statistics(statistics_df, manual_id = 'N/A'): - """ - Function that takes the output of dataset_IoU Statistics and outputs a - global count of true positives and false positives, as well as computing \ - the precision, recall, and f1 metrics across the dataset. - Args: - statistics_df (Dataframe) - - Dataframe of matrix IoU scores for multiple clips. - Returns: - Dataframe of global IoU statistics which include the number of true - positives, false positives, and false negatives. Contains Precision, - Recall, and F1 metrics as well - """ - - #data_class = statistics_df["MANUAL ID"][0] - # taking the sum of the number of true positives and false positives. - tp_sum = statistics_df["TRUE POSITIVE"].sum() - fn_sum = statistics_df["FALSE NEGATIVE"].sum() - fp_sum = statistics_df["FALSE POSITIVE"].sum() - # calculating the precision, recall, and f1 - try: - precision = tp_sum / (tp_sum + fp_sum) - recall = tp_sum / (tp_sum + fn_sum) - f1 = 2 * (precision * recall) / (precision + recall) - except ZeroDivisionError: - print('''Error in calculating Precision, Recall, and F1. Likely due to - zero division, setting values to zero''') - precision = 0 - recall = 0 - f1 = 0 - # building a dictionary of the above calculations - entry = {'MANUAL ID': manual_id, - 'TRUE POSITIVE': tp_sum, - 'FALSE NEGATIVE': fn_sum, - 'FALSE POSITIVE': fp_sum, - 'PRECISION': round(precision, 4), - 'RECALL': round(recall, 4), - 'F1': round(f1, 4)} - # returning the dictionary as a pandas dataframe - return pd.DataFrame.from_dict([entry]) - - -def dataset_Catch(automated_df, manual_df): - """ - Function that determines the overlap of each human label with respect to - all of the human labels in a clip across a large number of clips. - Args: - automated_df (Dataframe) - - Dataframe of automated labels for multiple audio clips. - manual_df (Dataframe) - - Dataframe of human labels for multiple audio clips. - Returns: - Dataframe of human labels with a column for the catch values of each - label. - """ - # Getting a list of clips - clips = automated_df["IN FILE"].to_list() - # Removing duplicates - clips = list(dict.fromkeys(clips)) - # Initializing the ouput dataframe - manual_df_with_Catch = pd.DataFrame() - # Looping through all of the audio clips that have been labelled. 
- for clip in clips: - print(clip) - # Isolating the clips from both the automated and human dataframes - clip_automated_df = automated_df[automated_df["IN FILE"] == clip] - clip_manual_df = manual_df[manual_df["IN FILE"] == clip] - # Calling the function that calculates the catch over a specific clip - Catch_Array = clip_catch(clip_automated_df, clip_manual_df) - # Appending the catch values per label onto the manual dataframe - clip_manual_df["Catch"] = Catch_Array - if manual_df_with_Catch.empty: - manual_df_with_Catch = clip_manual_df - else: - manual_df_with_Catch = manual_df_with_Catch.append(clip_manual_df) - # Resetting the indices - manual_df_with_Catch.reset_index(inplace=True, drop=True) - return manual_df_with_Catch - - - -# Goes through each class, measuring how effective the labels are in each clip -def clip_statistics(automated_df,manual_df, stats_type = "IoU", threshold = 0.5): - # Creating identifying the overlapping classes between the two sets of dataframes - - # Creating a list of classes from the automated dataframe - automated_class_list = automated_df["MANUAL ID"].to_list() - automated_class_list = list(dict.fromkeys(automated_class_list)) - # Creating a list of classes from the manual dataframe - manual_class_list = manual_df["MANUAL ID"].to_list() - manual_class_list = list(dict.fromkeys(manual_class_list)) - # Finding the intersection between the manual and automated classes - class_list = np.intersect1d(automated_class_list,manual_class_list) - - # Initializing the output dataframe - clip_statistics = pd.DataFrame() - # Looping through each class and comparing the automated labels to the manual labels - for class_ in class_list: - #print(class_) - # isolating the current class of interest - temp_manual_class_df = manual_df[manual_df["MANUAL ID"] == class_] - temp_automated_class_df = automated_df[automated_df["MANUAL ID"] == class_] - # The case if clip_statistics hasn't been filled yet - if clip_statistics.empty: - clip_statistics = automated_labeling_statistics(temp_automated_class_df, temp_manual_class_df, stats_type = stats_type, threshold = threshold) - else: - temp_df = automated_labeling_statistics(temp_automated_class_df, temp_manual_class_df, stats_type = stats_type, threshold = threshold) - clip_statistics = clip_statistics.append(temp_df) - clip_statistics.reset_index(inplace=True,drop=True) - return clip_statistics - -def class_statistics(clip_statistics): - # Initializing the output dataframe - class_statistics = pd.DataFrame() - # creating a list of the unique classes being passed in. - class_list = clip_statistics["MANUAL ID"].to_list() - class_list = list(dict.fromkeys(class_list)) - for class_ in class_list: - #print(class_) - # isolating the current class of interest - class_df = clip_statistics[clip_statistics["MANUAL ID"] == class_] - if class_statistics.empty: - class_statistics = global_statistics(class_df, manual_id = class_) - else: - temp_df = global_statistics(class_df, manual_id = class_) - class_statistics = class_statistics.append(temp_df) - class_statistics.reset_index(inplace=True,drop=True) - return class_statistics diff --git a/audino/backend/routes/project_segmentations.py b/audino/backend/routes/project_segmentations.py index 043f78d7..dac76673 100644 --- a/audino/backend/routes/project_segmentations.py +++ b/audino/backend/routes/project_segmentations.py @@ -6,7 +6,7 @@ from . 
 import api
 from .data import generate_segmentation
 from .helper_functions import general_error, missing_data
-from .piha import (update_confidence)
+from .data_sci_processing import (update_confidence)


 @api.route(
     "/projects/<int:project_id>/data/<int:data_id>/segmentations/<int:seg_id>",
     methods=["DELETE"],
 )
@@ -38,6 +38,7 @@ def delete_segmentations(project_id, data_id, seg_id):
         update_confidence(project_id, data_id, username)

     except Exception as e:
+        app.logger.info(e)
         msg = f"Could not delete segmentation"
         return general_error(msg, e, type="SEGMENTATION_DELETION_FAILED")

@@ -147,6 +148,7 @@ def add_segmentations_batch(project_id, data_id):
             if (segmentation_data != None):
                 update_confidence(project_id, data_id, username)
     except Exception as e:
+        app.logger.info(e)
         message = f"Could not create CONFIDENCE"

         return (
@@ -247,6 +249,8 @@ def add_segmentations(project_id, data_id, seg_id=None):
     try:
         update_confidence(project_id, data_id, username)
     except Exception as e:
+        app.logger.info("Could not update confidence")
+        app.logger.info(e)
         msg = f"Could not create CONFIDENCE"

     if request.method == "POST":
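
For context, a minimal standalone sketch of the agreement scoring this patch introduces, using a toy annotation table in the same column format that make_dataframe() produces. The names toy_annotations and agreement_score are illustrative only, and the fixed eps stands in for the per-user minimum-distance heuristic in DBSCAN_auto_dis_builder_min_dis2:

    # Illustrative only; mirrors the clustering/agreement idea on a toy example.
    import pandas as pd
    from sklearn.cluster import DBSCAN
    from sklearn import metrics

    # Three users mark roughly the same two audio events (times in seconds).
    toy_annotations = pd.DataFrame({
        "OFFSET":      [1.0, 1.1, 0.9,  5.0, 5.2, 4.9],
        "END TIMES":   [2.0, 2.1, 1.95, 6.0, 6.1, 5.8],
        "LAST MOD BY": ["alice", "bob", "carol", "alice", "bob", "carol"],
    })

    def agreement_score(df, eps=0.5):
        """Cluster (start, end) pairs and return a 0-1 agreement score."""
        points = df[["OFFSET", "END TIMES"]]
        clusters = DBSCAN(eps=eps, min_samples=2).fit_predict(points)
        # silhouette_score is in [-1, 1]; rescale to [0, 1] as run_clustering() does.
        return (metrics.silhouette_score(points, clusters) + 1) / 2

    print(agreement_score(toy_annotations))  # close to 1.0: annotators agree

In the patched backend the equivalent logic runs inside update_confidence(), which is reached through the PUT /update_confidence/<project_id>/<data_id> route and through the segmentation add/delete handlers in project_segmentations.py.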