# Source code for dodfminer.extract.polished.backend.ner

"""NER backend for act and propriety extraction.

This module contains the ActNER class, which has everything necessary to
extract an act and its properties using a trained NER model.

"""

import re
import nltk
import numpy as np

# pylint: disable=too-few-public-methods

class ActNER:
    """Act NER Class.

    This class encapsulates all functions and attributes related
    to the process of NER extraction.

    Note:
        This class is one of the parents of the base act class.

    Attributes:
        _model: The trained NER model for the act, as returned by
            ``_load_model`` (``None`` when no model is available).
        _preds: List with the raw tag sequence predicted for every act
            passed to ``_prediction``.

    """

    # Character classes used by ``_limits``. They are built once, as sets,
    # so the per-character membership tests are O(1).
    _LETTERS = frozenset('abcdefghijklmnopqrstuvwxyz')
    _DIGITS = frozenset('0123456789')
    _SYMBOLS = frozenset('(,./-')
    _KNOWN_CHARS = _LETTERS | _DIGITS | _SYMBOLS | frozenset(' ')

    # Number of neighbouring words, on each side, whose base features are
    # added to the features of the current word.
    _WINDOW = 4

    def __init__(self):
        nltk.download('punkt', quiet=True)
        super().__init__()

        # pylint: disable=assignment-from-no-return
        self._model = self._load_model()
        self._preds = []

    def _load_model(self):
        """Load Model from models/folder.

        Note:
            This function needs to be overwritten in the child class.
            If it is not, the backend falls back to regex and no model
            is returned.

        Returns:
            None. This default implementation never provides a model.
        """
        # ``_backend`` and ``_name`` are expected to be set by the child
        # class; ``getattr`` keeps the fallback working even when they
        # are not defined yet.
        if getattr(self, '_backend', None) == 'ner':
            name = getattr(self, '_name', type(self).__name__)
            print(
                f"Act {name} does not have an entity extraction model: FALLING BACK TO REGEX")
        self._backend = 'regex'

    def _prediction(self, act):
        """Predict classes for a single act.

        The raw tag sequence is also stored in ``self._preds``.

        Args:
            act (string): Full act

        Returns:
            A dictionary with the properties and their
            predicted values.
        """
        act = self._preprocess(act)
        feats = self._get_features(self._split_sentence(act))
        pred = self._model.predict_single(feats)
        self._preds.append(pred)
        return self._predictions_dict(act, pred)

    @classmethod
    def _preprocess(cls, text):
        """Preprocess text for CRF model.

        Args:
            text (str): Raw text of the act.

        Returns:
            The text in a single line, without repeated spaces and with
            words that were hyphenated before a space joined back.
        """
        text = text.replace('\n', ' ').strip()
        text = re.sub(' +', ' ', text)
        # "exem- plo" -> "exemplo": undo hyphenation followed by a space.
        text = re.sub(r'([a-zA-Z0-9])- ', r'\1', text)
        return text

    @classmethod
    def _limits(cls, sentence):
        """Find the limits of words in the sentence.

        A new word starts at a letter not preceded by a letter, at a digit
        not preceded by a digit, at every symbol in ``( , . / -`` and at
        any other non-space character that follows an ASCII letter.

        Args:
            sentence (str): target sentence.

        Returns:
            List of the positions in which each word in sentence starts.
            An empty sentence yields an empty list.
        """
        if not sentence:
            return []

        letters, digits = cls._LETTERS, cls._DIGITS
        symbols, known = cls._SYMBOLS, cls._KNOWN_CHARS

        lim = []
        if sentence[0] != ' ':
            lim.append(0)

        for i in range(1, len(sentence)):
            current = sentence[i].lower()
            previous = sentence[i - 1].lower()

            starts_word = (
                (current in letters and previous not in letters)
                or (current in digits and previous not in digits)
                or current in symbols
                or (current not in known and previous in letters)
            )
            if starts_word:
                lim.append(i)
        return lim

    def _split_sentence(self, sentence):
        """Split a sentence into words.

        Args:
            sentence (str): Sentence to be split.

        Returns:
            List of words in the sentence, each one stripped of
            surrounding whitespace.
        """
        lim = self._limits(sentence)
        lim.append(len(sentence))

        # Each word spans from its own start up to the start of the next.
        return [sentence[start:end].strip()
                for start, end in zip(lim, lim[1:])]

    @classmethod
    def _get_base_feat(cls, word):
        """Get the base features of a word, for the CRF model.

        Args:
            word (str): Word to be processed.

        Returns:
            Dictionary with the base features of the word.
        """
        return {
            'word': word.lower(),
            'is_title': word.istitle(),
            'is_upper': word.isupper(),
            'num_digits': str(sum(c.isdigit() for c in word)),
        }

    def _add_base_feat(self, features, sentence, index, prefix):
        """Updates a dictionary of features with the features of a word.

        Nothing is added when ``index`` falls outside the sentence.

        Args:
            features (dict): Dictionary with the features already processed.
                It is updated in place.
            sentence (list): List of words in the sentence.
            index (int): Index of the current word in the sentence.
            prefix (str): Prefix to be added to the name of the features of the current word.

        """
        if 0 <= index < len(sentence):
            for name, value in self._get_base_feat(sentence[index]).items():
                features[prefix + name] = value

    def _get_features(self, sentence):
        """Get the features of a sentence, for the CRF model.

        Each word gets a bias, its relative position in the sentence, its
        own base features and the base features of up to ``_WINDOW`` words
        on each side, prefixed with the signed offset (e.g. ``-2:``, ``+1:``).

        Args:
            sentence (list): List of words in the sentence.

        Returns:
            List of dictionaries with the features of each word.
        """
        total = len(sentence)
        sent_features = []

        for i in range(total):
            word_feat = {
                'bias': 1.0,
                'text_position': i / total,
            }

            for offset in range(-self._WINDOW, self._WINDOW + 1):
                # The word itself has no prefix; neighbours use "-4:".."+4:".
                prefix = f'{offset:+d}:' if offset else ''
                self._add_base_feat(word_feat, sentence, i + offset, prefix)

            sent_features.append(word_feat)

        return sent_features

    def _predictions_dict(self, sentence, prediction):
        """Create dictionary of properties.

        Create dictionary of tags to save predicted entities.

        Args:
            sentence (str): Preprocessed text of the act. It is tokenized
                here with ``_limits``, so ``prediction`` must have one tag
                per word found by that method.
            prediction (list): The corresponding predicted tag for each
                word in the sentence (``O``, ``B-<name>`` or ``I-<name>``).
                It is not modified.

        Returns:
            A dictionary of the properties found. For each entity name
            known by the model the value is ``np.nan`` when nothing was
            found, a string when exactly one entity was found, and a list
            of strings otherwise.

        """
        dict_ato = {
            klass[2:]: [] for klass in self._model.classes_ if klass != 'O'
        }

        limits = self._limits(sentence)
        limits.append(len(sentence))

        # A trailing 'O' closes an entity that runs up to the last word.
        # It is added to a copy so the caller's list (which is also kept
        # in ``self._preds``) is left untouched.
        tags = list(prediction)
        tags.append('O')

        current = ''
        entity_start = -1
        for i, tag in enumerate(tags):
            # Anything other than the continuation of the open entity
            # closes it at the start of the current word.
            if current != '' and tag != 'I-' + current:
                dict_ato[current].append(
                    sentence[entity_start:limits[i]].strip())
                entity_start = -1
                current = ''

            # A 'B-' tag always opens an entity; an 'I-' tag opens one
            # only when no entity is currently open.
            kind = tag[:1]
            if kind == 'B' or (kind == 'I' and current == ''):
                current = tag[2:]
                entity_start = limits[i]

        for key, val in dict_ato.items():
            if not val:
                dict_ato[key] = np.nan
            elif len(val) == 1:
                dict_ato[key] = val[0]

        return dict_ato