# Source code for dodfminer.extract.pure.core

# coding=utf-8

"""Extract content from DODFS and export to JSON.

Contains the class ContentExtractor, which has two public functions
available to extract the DODF to JSON.

Usage example::

    from dodfminer.extract.pure.core import ContentExtractor

    pdf_text = ContentExtractor.extract_text(file)
    ContentExtractor.extract_to_txt(folder)

"""

import os
import json
import unicodedata

from pathlib import Path

import fitz

from dodfminer.extract.pure.utils.title_extractor import ExtractorTitleSubtitle
from dodfminer.extract.pure.utils.box_extractor import get_doc_text_boxes

RESULTS_PATH = "results/"
RESULTS_PATH_JSON = "results/json"
RESULTS_PATH_TXT = "results/txt"


class ContentExtractor:
    """Extract content from DODFs and export to JSON.

    Extracts content from DODF files using as support the title and
    subtitle databases--which run using MuPDF--and the Tesseract OCR
    library. All the content is exported to a JSON file, in which its
    keys are DODF titles or subtitles, and its values are the
    correspondent content.

    Note:
        This class is not constructable, it cannot generate objects.

    """
[docs] @classmethod # pylint: disable=too-many-arguments def extract_text(cls, file, single=False, block=False, is_json=True, sep=' ', norm='NFKD'): """Extract block of text from file Args: file: The DODF to extract titles from. single: output content in a single file in the file directory. block: Extract the text as a list of text blocks. json: The list of text blocks are written as a json file. sep: The separator character between each block of text. norm: Type of normalization applied to the text. Note: To learn more about the each type of normalization used in the `unicode.normalization` method, `click here <https://docs.python.org/3/library/unicodedata.html#unicodedata.normalize>`_. Returns: These are the outcomes for each parameter combination. When `block=True` and `single=True`: In case `json=True`, The method saves a JSON file containing the text blocks in the DODF file. However, is case `json=False`, the text from the whole PDF is saved as a string in a .txt file. When `block=True` and `single=False`: The method returns an array containing text blocks. Each array in the list have 5 values: the first four are the coordinates of the box from where the text was extracted (x0, y0, x1, y1), while the last is the text from the box. Example:: (127.77680206298828, 194.2507781982422, 684.0039672851562, 211.97523498535156, "ANO XLVI EDICAO EXTRA No- 4 BRASILIA - DF") When `block=False` and `single=True`: The text from the whole PDF is saved in a .txt file as a normalized string. When `block=False` and `single=False`: The method returns a normalized string containing the text from the whole PDF. 
""" drawboxes_text = '' list_of_boxes = [] pymu_file = fitz.open(file) for page_boxes in get_doc_text_boxes(pymu_file): for text in page_boxes: if int(text[1]) != 55 and int(text[1]) != 881: if block: norm_text = cls._normalize_text(text[4], norm) if is_json: list_of_boxes.append((text[0], text[1], text[2], text[3], norm_text)) else: drawboxes_text += (norm_text + sep) else: drawboxes_text += (text[4] + sep) if block: if not single: return list_of_boxes if is_json: cls._save_single_file(file, 'json', json.dumps(list_of_boxes)) else: return cls._save_single_file(file, 'txt', drawboxes_text) drawboxes_text = cls._normalize_text(drawboxes_text, norm) return drawboxes_text if not single else cls._save_single_file(file, 'txt', drawboxes_text)
    @classmethod
    def extract_structure(cls, file, single=False, norm='NFKD'):  # pylint: disable=too-many-locals
        """Extract boxes of text grouped by section and title.

        Args:
            file: The DODF file to extract titles from.
            single: Output content in a single file in the file directory.
            norm: `Type of normalization
                <https://docs.python.org/3/library/unicodedata.html#unicodedata.normalize>`_
                applied to the text.

        Returns:
            A dictionary with the blocks organized by title, or ``None``
            when the titles cannot be extracted. Example::

                {
                    "Title": [
                        [x0, y0, x1, y1, "Text"]
                    ],
                    ...
                }
        """
        content_dict = {}
        try:
            title_base = cls._extract_titles(file).json.keys()
        # It is really necessary to catch a generic exception here
        # pylint: disable=broad-except
        except Exception as excpt:
            cls._log(excpt)
            return None
        # Normalized text blocks with their page coordinates.
        boxes = cls.extract_text(file, block=True, norm=norm)
        first_title = False
        is_title = False
        actual_title = ''
        section = None
        for box in boxes:
            text = box[4].strip()
            is_title = True
            if text in ["SECAO I", "SECAO II", "SECAO III"]:
                # Switch to a new section only when the current one
                # already has content, or when no section was seen yet.
                if content_dict.get(section) or not section:
                    section = text
                    if section not in content_dict.keys():
                        content_dict.update({section: {}})
                    actual_title = None
            else:
                # Compare the box text against every known title; a match
                # opens a new bucket under the current section.
                for title in title_base:
                    text = text.replace("\n", " ")
                    title = title.replace("\n", " ")
                    normalized_title = cls._normalize_text(title, norm)
                    if text == normalized_title:
                        first_title = True
                        actual_title = normalized_title
                        if section and (title not in content_dict[section].keys()):
                            content_dict[section].update(
                                {normalized_title: []})
                    else:
                        is_title = False
            # Non-title boxes are appended to the current title bucket.
            if first_title and not is_title and section and actual_title:
                # y0 of 55 and 881 look like page header/footer boxes —
                # presumably fixed DODF layout furniture; TODO confirm.
                if int(box[1]) != 55 and int(box[1]) != 881:
                    content_dict[section][actual_title].append(box[:5])
        return content_dict if not single else cls._save_single_file(
            file, 'json', json.dumps(content_dict))
[docs] @classmethod def extract_to_txt(cls, folder='./', norm='NFKD'): """Extract information from DODF to a .txt file. For each PDF file in data/DODFs, the method extracts information from the PDF and writes it to the .txt file. Args: folder: The folder containing the PDFs to be extracted. norm: `Type of normalization <https://docs.python.org/3/library/unicodedata.html#unicodedata.normalize>`_ applied to the text. """ pdfs_path_list = cls._get_pdfs_list(folder) cls._create_single_folder(os.path.join(folder, RESULTS_PATH)) cls._create_single_folder(os.path.join(folder, RESULTS_PATH_TXT)) txt_path_list = cls._get_txt_list(folder) for file in pdfs_path_list: if file[-5:] == '.json': continue pdf_name = os.path.splitext(os.path.basename(file))[0] if pdf_name not in txt_path_list: cls._log(pdf_name) text = cls.extract_text(file, norm=norm) t_path = cls._struct_subfolders(file, False, folder) with open(t_path, "w", encoding='utf-8') as file: file.write(text) else: cls._log("TXT already exists")
[docs] @classmethod def extract_to_json(cls, folder='./', titles_with_boxes=False, norm='NFKD'): """Extract information from DODF to JSON. Args: folder: The folder containing the PDFs to be extracted. titles_with_boxes: If True, the method builds a dict containing a list of tuples (similar to `extract_structure`). Otherwise, the method structures a list of tuples (similar to `extract_text`). norm: `Type of normalization <https://docs.python.org/3/library/unicodedata.html#unicodedata.normalize>`_ applied to the text. Returns: For each PDF file in data/DODFs, extract information from the PDF and output it to a JSON file. """ # Get list of all downloaded pdfs pdfs_path_list = cls._get_pdfs_list(folder) # Get list of existing json to not repeat work json_path_list = cls._get_json_list(folder) cls._create_single_folder(os.path.join(folder, RESULTS_PATH)) cls._create_single_folder(os.path.join(folder, RESULTS_PATH_JSON)) for file in pdfs_path_list: pdf_name = os.path.splitext(os.path.basename(file))[0] # We do not want the system to repeat itself doing the same work if pdf_name not in json_path_list: # low cost extractions if os.path.getsize(file) < 30000000: # Remove in future. # Remove images that might still there from previous exec cls._log(pdf_name) if titles_with_boxes: content = cls.extract_structure(file, norm=norm) else: content = cls.extract_text(file, block=True, norm=norm) j_path = cls._struct_subfolders(file, True, folder) with open(j_path, "w", encoding="utf-8") as file: json.dump(content, file, ensure_ascii=False) else: cls._log("JSON already exists")
@classmethod def _save_single_file(cls, file_path, file_type, content): file_path, _, _ = file_path.rpartition('.pdf') file_path = f"{file_path}.{file_type}" with open(file_path, 'w+', encoding='utf-8') as file: file.write(content)
[docs] @classmethod def _normalize_text(cls, text, form='NFKD'): """This method is used for text nomalization. Args: text: The text to be normalized. form: `Type of normalization <https://docs.python.org/3/library/unicodedata.html#unicodedata.normalize>`_ applied to the text. Returns: A string with the normalized text. """ normalized = unicodedata.normalize(form, text).encode( 'ascii', 'ignore').decode('utf8') return normalized
[docs] @classmethod def _extract_titles(cls, file): """Extract titles and subtitles from the DODF. Args: file: The DODF to extract the titles. Returns: An object of type ExtractorTitleSubtitle, in which have the attributes: titles: get all titles from PDF. subtitle: get all subtitles from PDF. Raises: Exception: error in extracting titles from PDF. """ try: title_database = ExtractorTitleSubtitle(file) cls._log(file) except Exception as exct: cls._log(f"Error in extracting files from {file}: {exct}") raise else: return title_database
[docs] @classmethod def _get_pdfs_list(cls, folder): """Get DODFs list from the path. Args: folder: The folder containing the PDFs to be extracted. Returns: A list of DODFS' PDFs paths. """ pdfs_path_list = [] for dir_path, _, file_names in os.walk(os.path.expanduser(os.path.join(folder))): for file in file_names: if '.pdf' in file: pdfs_path_list.append(os.path.join(dir_path, file)) return pdfs_path_list
[docs] @classmethod def _get_json_list(cls, folder): """Get list of exisiting JSONs from the path. Args: folder: The folder containing the PDFs to be extracted. Returns: A list of all exisiting JSONs. """ aux = [] for dir_path, _, file_names in os.walk(os.path.expanduser(os.path.join(folder, RESULTS_PATH_JSON))): for file in file_names: aux.append(os.path.join(dir_path, file)) json_path_list = [] for file in aux: json_path_list.append(os.path.splitext(os.path.basename(file))[0]) return json_path_list
[docs] @classmethod def _get_txt_list(cls, folder): """Get list of exisiting .txt files from the path. Args: folder: The folder containing the PDFs to be extracted. Returns: A list of all exisiting .txt files. """ aux = [] for dir_path, _, file_names in os.walk(os.path.expanduser(os.path.join(folder, RESULTS_PATH_TXT))): for file in file_names: aux.append(os.path.join(dir_path, file)) txt_path_list = [] for file in aux: txt_path_list.append(os.path.splitext(os.path.basename(file))[0]) return txt_path_list
    @classmethod
    def _struct_subfolders(cls, path, json_f, folder):
        """Create the results directory mirroring the PDF's subfolders.

        Mirrors the subfolder structure of *path* (relative to *folder*)
        under the corresponding results directory, creating the
        directories as needed.

        Args:
            path: The path to the extracted file.
            json_f (boolean): If True, the file will be extracted to a
                JSON. Otherwise, it will be extracted to a ``.txt``.
            folder: The folder containing the PDFs to be extracted.

        Returns:
            The output path where the JSON/``.txt`` should be saved.
        """
        type_f = '.json' if json_f else '.txt'
        res_path = RESULTS_PATH_JSON if json_f else RESULTS_PATH_TXT
        # Strip the base folder prefix so only the relative subpath of
        # the PDF is mirrored under the results directory.
        path = path.replace(folder, "", 1)
        # NOTE(review): splitting on '/' assumes POSIX-style paths;
        # confirm this is never run with Windows separators.
        splited = path.split('/')
        # Swap the file's extension for the output type ('.json'/'.txt');
        # everything after the first '.' in the basename is dropped.
        basename = splited[-1].split('.')
        basename = basename[0] + type_f
        splited[-1] = basename
        final_path = '/'.join(splited[1:])
        final_path = os.path.join(folder, res_path, final_path)
        path = Path(os.path.join(folder, res_path, *splited[:-1]))
        try:
            path.mkdir(parents=True)
        except FileExistsError:
            # The directory tree already exists from a previous run.
            pass
        return final_path
[docs] @classmethod def _create_single_folder(cls, path): """Create a single folder given the directory path. This function might create a folder, observe if the folder already exists, or raise an error if the folder cannot be created. Args: path: The path to be created. Raises: OSError: Error creating the directory. """ if os.path.exists(path): cls._log(os.path.basename(path) + " folder already exist") else: try: os.mkdir(path) except OSError as error: cls._log("Exception during the directory creation") cls._log(str(error)) else: basename = os.path.basename(path) cls._log(basename + " directory successful created")
[docs] @classmethod def _log(cls, msg): """Print message from within the ContentExtractor class. Args: msg: String with message that should be printed out. """ print(f"[EXTRACTOR] {msg}")