Source code for dvha.tools.name_prediction

#!/usr/bin/env python
# -*- coding: utf-8 -*-

# tools.name_prediction.py
"""Implementation of rapidfuzz for ROI name prediction"""
# Copyright (c) 2016-2021 Dan Cutright
# This file is part of DVH Analytics, released under a BSD license.
#    See the file LICENSE included with this distribution, also
#    available at https://github.com/cutright/DVH-Analytics
from rapidfuzz import fuzz
from dvha.tools.roi_name_manager import clean_name


class ROINamePredictor:
    """ROI Name Prediction class object

    Parameters
    ----------
    roi_map : DatabaseROIs
        ROI map object
    weight_simple : float, optional
        Scaling factor for fuzz.ratio for combined score
    weight_partial : float, optional
        Scaling factor for fuzz.partial_ratio for combined score
    threshold : float, optional
        A prediction is only returned if its combined score is strictly
        greater than this value

    Raises
    ------
    ZeroDivisionError
        If ``weight_simple + weight_partial`` is zero

    Notes
    -----
    The two weights are rescaled so that they sum to 2; equal weights
    therefore leave both fuzz scores unscaled.
    """

    def __init__(
        self, roi_map, weight_simple=1.0, weight_partial=0.6, threshold=0.0
    ):
        self.roi_map = roi_map

        # Normalize so the weights sum to 2 (i.e., average to 1)
        norm_weight = weight_partial + weight_simple
        self.weight = {
            "simple": 2 * weight_simple / norm_weight,
            "partial": 2 * weight_partial / norm_weight,
        }
        self.threshold = threshold

    def get_best_roi_match(self, roi, physician, return_score=False):
        """Check all ROI variations for best match, return physician ROI

        Parameters
        ----------
        roi : str
            An ROI name
        physician : str
            Physician as stored in ROI Map
        return_score : bool, optional
            If true, return a tuple: prediction, score

        Returns
        -------
        str, tuple or None
            The physician ROI associated with the ROI variation that has
            the highest combined fuzz score for ``roi`` (as a tuple of
            prediction and score if ``return_score`` is true). ``None``
            is returned if the physician has no variations or if the best
            score does not exceed ``self.threshold``.
        """
        physician_variations = self.roi_map.get_all_variations_of_physician(
            physician
        )
        fuzz_scores = self.get_combined_fuzz_scores(roi, physician_variations)
        if not fuzz_scores:
            return None

        # Scores are sorted best-first, so the first entry is the winner
        score, predicted_variation = fuzz_scores[0]
        if not score > self.threshold:
            return None

        # Only look up the physician ROI once the score is known to pass
        prediction = self.roi_map.get_physician_roi(
            physician, predicted_variation
        )
        if return_score:
            return prediction, score
        return prediction

    def get_combined_fuzz_score(self, a, b, mode="geom_mean"):
        """Return ``combine_scores`` for strings ``a`` and ``b``

        Parameters
        ----------
        a : str
            Any string
        b : str
            Another string for comparison
        mode : str, optional
            Method for combining ``fuzz.ratio`` and ``fuzz.partial_ratio``.
            Options are 'geom_mean', 'product', and 'average'

        Returns
        -------
        float
            Results from ``combine_scores`` for ``a`` and ``b``
        """
        # Both names pass through clean_name before comparison
        a, b = clean_name(a), clean_name(b)
        simple = float(fuzz.ratio(a, b) * self.weight["simple"])
        partial = float(fuzz.partial_ratio(a, b) * self.weight["partial"])
        return self.combine_scores(simple, partial, mode=mode)

    @staticmethod
    def combine_scores(score_1, score_2, mode="average"):
        """Get a combined fuzz score

        Parameters
        ----------
        score_1 : float
            A fuzz ratio score
        score_2 : float
            Another fuzz ratio score
        mode : str, optional
            Method for combining ``score_1`` and ``score_2``. Options are
            'geom_mean', 'product', and 'average'. Any other value is
            treated as 'average'.

        Returns
        -------
        float
            Combined score
        """
        if mode == "geom_mean":
            return (score_1 * score_2) ** 0.5
        if mode == "product":
            # NOTE(review): dividing by 100 presumably keeps the product on
            # the same 0-100 scale as the input scores -- confirm
            return score_1 * score_2 / 100.0
        # average (also the fallback for unrecognized modes)
        return (score_1 + score_2) / 2.0

    def get_combined_fuzz_scores(self, string, list_of_strings):
        """Compare a string against many

        Parameters
        ----------
        string : str
            A string to compare against each string in ``list_of_strings``
        list_of_strings : list
            A list of strings for comparison

        Returns
        -------
        list or None
            A list of tuples (score, string) in descending order of score,
            or ``None`` if ``list_of_strings`` is empty
        """
        scored = [
            (self.get_combined_fuzz_score(string, string_b), string_b)
            for string_b in list_of_strings
        ]
        if not scored:
            return None

        # Stable ascending sort followed by a reversal: highest score first,
        # and equal scores appear in reverse input order
        scored.sort(key=lambda pair: pair[0])
        scored.reverse()
        return scored