Source code for dodfminer.extract.pure.utils.title_filter

"""Find titles using a Filter."""

class BoldUpperCase:
    """Filter predicates for detecting bold, upper-case title text.

    Every predicate takes a dict describing a piece of text and returns
    a boolean, so each one can be used directly as a filter callback.

    Note:
        This class only holds constants and class methods; it is not
        meant to be instantiated.

    """

    # A title must be strictly longer than this many characters
    # (measured after surrounding whitespace and dots are stripped).
    TEXT_MIN = 4

    # Regex-like patterns for upper-case text that is not a real title.
    # Not referenced by the methods of this class.
    TRASH_WORDS = [
        "SUMÁRIO",
        "DIÁRIO OFICIAL",
        "SEÇÃO (I|II|III)",
    ]

    # Values of ``data['flags']`` that are treated as bold.
    # NOTE(review): presumably font-flag bitmasks produced by the PDF
    # text extractor -- confirm against the code that builds ``data``.
    BOLD_FLAGS = [16, 20]

    @classmethod
    def dict_text(cls, data):
        """Check whether ``data['text']`` looks like a title.

        Leading/trailing whitespace and then leading/trailing dots are
        stripped before the checks are applied. The result is true when
        the remaining text:

        - does not contain 4 or more consecutive spaces;
        - is longer than ``BoldUpperCase.TEXT_MIN`` characters;
        - is equal to its own upper-cased form (text without any cased
          character, such as digits only, also satisfies this).

        Args:
            data: Mapping with a ``'text'`` key holding a string.

        Returns:
            Boolean indicating if the text is a title.

        """
        text = data['text'].strip().strip('.')
        has_no_wide_gap = 4 * " " not in text
        is_long_enough = len(text) > cls.TEXT_MIN
        is_upper_case = text == text.upper()
        return has_no_wide_gap and is_long_enough and is_upper_case

    @classmethod
    def dict_bold(cls, data):
        """Check whether ``data['flags']`` marks the text as bold.

        Evaluates to True if ``data['flags']`` is one of the values in
        ``BoldUpperCase.BOLD_FLAGS``.

        Args:
            data: Mapping with a ``'flags'`` key.

        Returns:
            Boolean indicating if the flags value is a bold flag.

        """
        flags = data['flags']
        return flags in cls.BOLD_FLAGS