#!/usr/bin/python
"""
Implements a very simple scoring function that looks for exact matches
of the entity's name components in the text of the document.


Copyright (c) 2012 Computable Insights LLC
released under the MIT X11 License, see license.txt
"""

## import standard libraries
import re
import sys
import string
import traceback

## unicode translation table that converts every ASCII punctuation
## character (string.punctuation) to a single space
strip_punctuation = dict((ord(char), u" ") for char in string.punctuation)

## matches any run of whitespace; raw string so that the backslash
## reaches the regex engine instead of being an invalid string escape
white_space_re = re.compile(r"\s+")

def strip_string(s):
    """
    Normalizes a unicode string for matching: punctuation listed in
    the strip_punctuation table becomes a space, the result is
    lower-cased, and every run of whitespace is collapsed to a single
    space.  Leading and trailing whitespace is collapsed, not removed.
    """
    ## swap punctuation for spaces, then fold case
    without_punctuation = s.translate(strip_punctuation)
    lowered = without_punctuation.lower()

    ## squeeze each run of whitespace down to one space
    return white_space_re.sub(" ", lowered)

def prepare_entities(entities):
    """
    Creates a dict keyed on entity name with the values set to a
    representation that is efficient for the scorer.

    Each value is a dict with two keys:
      "parts":   the distinct tokens of the normalized name, followed
                 by the normalized full name itself
      "longest": length of the normalized full name, which the scorer
                 divides by to normalize match lengths

    Names that normalize to nothing (empty or punctuation-only) get an
    empty "parts" list and a "longest" of zero, so they never match.
    """
    prep = {}
    for name in entities:
        ## normalize the name once and split it into tokens
        tokens = strip_string(name).split()

        ## rebuild the full name from its tokens, so that punctuation
        ## at the start or end of the original name does not leave a
        ## stray space that inflates the length and makes the match
        ## depend on what follows the name in the text
        full_name = " ".join(tokens)

        ## create set of tokens from entity's name
        parts = list(set(tokens))

        ## add full name as one of the 'names'; an empty full name is
        ## skipped because the empty string is a substring of any text
        if full_name:
            parts.append(full_name)

        ## assemble dict
        prep[name] = {"parts": parts, "longest": len(full_name)}

    return prep


class Scorer(object):
    """
    Scores how strongly a document mentions an entity, by looking for
    exact substring matches of the entity's name parts in the
    normalized text of the document.

    Attributes:
      text:  normalized document text; empty if normalization failed
      ready: True if the text was normalized successfully, else False
    """
    def __init__(self, text):
        """
        Takes text (unicode) and prepares to evaluate entity mentions.

        A failure to normalize the text is reported on stderr instead
        of being raised; check self.ready to see whether it succeeded.
        """
        ## always define text, so that compute_relevance can be called
        ## safely (and scores zero) even if normalization fails below
        self.text = u""
        try:
            self.text = strip_string(text)
            self.ready = True
        except Exception as exc:
            ## ignore failures, such as PDFs
            sys.stderr.write("failed to initialize on doc: %s\n" % exc)
            self.ready = False

    def compute_relevance(self, entity_representation):
        """
        Searches text for the parts of an entity's name.

        entity_representation is a dict with a "parts" list of name
        strings and a "longest" length to normalize by, as built by
        prepare_entities.

        returns int score between zero and 1000, which is intended to
        be a float in [0,1] measured in thousandths.  The score is the
        length of the longest part found in the text divided by
        "longest"; it is zero when no part is found or "longest" is 0.
        """
        ## look for name parts in text:
        matched_lengths = [len(name)
                           for name in entity_representation["parts"]
                           if name in self.text]
        longest = entity_representation["longest"]

        ## default score is 0; also guards the division below against
        ## an empty entity name, whose longest length is zero
        if not matched_lengths or not longest:
            return 0

        ## normalize by length of longest name, which is full_name, and
        ## return score in thousandths; floor division keeps the result
        ## exact instead of truncating a rounded float
        return int(1000 * max(matched_lengths) // longest)
