# Source code for dvha.tools.name_prediction

#!/usr/bin/env python
# -*- coding: utf-8 -*-

# tools.name_prediction.py
"""Implementation of rapidfuzz for ROI name prediction"""
# Copyright (c) 2016-2021 Dan Cutright
# This file is part of DVH Analytics, released under a BSD license.
#    See the file LICENSE included with this distribution, also
#    available at https://github.com/cutright/DVH-Analytics
from rapidfuzz import fuzz
from dvha.tools.roi_name_manager import clean_name


class ROINamePredictor:
    """Predict a physician ROI name from a free-text ROI name.

    Candidate ROI variations are ranked by a combination of
    ``fuzz.ratio`` and ``fuzz.partial_ratio``.

    Parameters
    ----------
    roi_map : DatabaseROIs
        ROI map object
    weight_simple : float, optional
        Scaling factor for fuzz.ratio for combined score
    weight_partial : float, optional
        Scaling factor for fuzz.partial_ratio for combined score
    threshold : float, optional
        Set a minimum score for a prediction to be returned

    Notes
    -----
    The two weights are normalised so that they always sum to 2 (equal
    weights therefore leave both fuzz scores unscaled). The weights must
    not sum to zero, otherwise the normalisation divides by zero.
    """

    def __init__(
        self, roi_map, weight_simple=1.0, weight_partial=0.6, threshold=0.0
    ):
        self.roi_map = roi_map
        self.threshold = threshold

        # Normalise so that weight["simple"] + weight["partial"] == 2.
        weight_sum = weight_partial + weight_simple
        self.weight = {
            "simple": 2 * weight_simple / weight_sum,
            "partial": 2 * weight_partial / weight_sum,
        }

    def get_best_roi_match(self, roi, physician, return_score=False):
        """Check all ROI variations for best match, return physician ROI

        Parameters
        ----------
        roi : str
            An ROI name
        physician : str
            Physician as stored in ROI Map
        return_score : bool, optional
            If true, return a tuple: prediction, score

        Returns
        -------
        str, tuple or None
            The physician ROI associated with the ROI variation that has
            the highest combined fuzz score for ``roi``; a tuple of
            (prediction, score) if ``return_score`` is true. ``None`` is
            returned when there are no variations to compare against or
            when the best score does not exceed ``threshold``.

        """
        physician_variations = self.roi_map.get_all_variations_of_physician(
            physician
        )
        fuzz_scores = self.get_combined_fuzz_scores(roi, physician_variations)
        if not fuzz_scores:
            return None

        # Scores are sorted best-first, so the head is the top candidate.
        score, predicted_variation = fuzz_scores[0]
        prediction = self.roi_map.get_physician_roi(
            physician, predicted_variation
        )
        if score > self.threshold:
            return (prediction, score) if return_score else prediction
        return None

    def get_combined_fuzz_score(self, a, b, mode="geom_mean"):
        """Return ``combine_scores`` for strings ``a`` and ``b``

        Both strings are passed through ``clean_name`` before comparison,
        and each fuzz score is scaled by its normalised weight before the
        two are combined.

        Parameters
        ----------
        a : str
            Any string
        b : str
            Another string for comparison
        mode : str, optional
            Method for combining ``fuzz.ratio`` and ``fuzz.partial_ratio``.
            Options are 'geom_mean', 'product', and 'average'

        Returns
        -------
        float
            Results from ``combine_scores`` for ``a`` and ``b``

        """
        a, b = clean_name(a), clean_name(b)

        simple = float(fuzz.ratio(a, b) * self.weight["simple"])
        partial = float(fuzz.partial_ratio(a, b) * self.weight["partial"])

        return self.combine_scores(simple, partial, mode=mode)

    @staticmethod
    def combine_scores(score_1, score_2, mode="average"):
        """Get a combined fuzz score

        Parameters
        ----------
        score_1 : float
            A fuzz ratio score
        score_2 : float
            Another fuzz ratio score
        mode : str, optional
            Method for combining ``score_1`` and ``score_2``.
            Options are 'geom_mean', 'product', and 'average'. Any
            unrecognised value is treated as 'average'.

        Returns
        -------
        float
            Combined score

        """
        if mode == "geom_mean":
            return (score_1 * score_2) ** 0.5
        if mode == "product":
            return score_1 * score_2 / 100.0
        # 'average', and the fallback for any unrecognised mode
        return (score_1 + score_2) / 2.0

    def get_combined_fuzz_scores(self, string, list_of_strings):
        """Compare a string against many

        Parameters
        ----------
        string : str
            A string to compare against each string in ``list_of_strings``
        list_of_strings : list
            A list of strings for comparison

        Returns
        -------
        list
            A list of tuples (score, string) sorted by descending score.
            An empty list is returned if ``list_of_strings`` is empty.

        """
        scored = [
            (self.get_combined_fuzz_score(string, candidate), candidate)
            for candidate in list_of_strings
        ]
        # Stable ascending sort followed by a reversal: among equal scores
        # the candidate that appeared later in the input is ranked first.
        scored.sort(key=lambda pair: pair[0])
        scored.reverse()
        return scored