# Source code for dodfminer.downloader.core

# coding=utf-8

"""Download DODFs from the Buriti Website and save on proper directory.

Download monthly pdfs of DODFs.

Usage example::

    downloader = Downloader()
    downloader.pull(start_date, end_date)

"""

import os
from pathlib import Path
from datetime import datetime
import tqdm
import requests
import json

from dateutil.relativedelta import relativedelta
from dodfminer.downloader.helper import check_date, get_downloads


MONTHS_STRING = ["", "01_Janeiro", "02_Fevereiro", "03_Março", "04_Abril",
                 "05_Maio", "06_Junho", "07_Julho", "08_Agosto",
                 "09_Setembro", "10_Outubro", "11_Novembro", "12_Dezembro"]


class Downloader:
    """Responsible for the download of the DODFs Pdfs.

    Args:
        save_path (str): Path to save the downloads. A ``dodfs`` folder is
            created inside it.

    Attributes:
        _download_path: Folder in which the downloads will be stored.
        _prog_bar: tqdm progress bar, also used to print log messages.

    """

    # Seconds to wait for the server before giving up. Without a timeout
    # ``requests`` may block forever on a stalled connection.
    _REQUEST_TIMEOUT = 60

    def __init__(self, save_path='./'):
        # The bar must exist before any folder is created: _log writes
        # through it.
        self._prog_bar = tqdm.tqdm()
        self._download_path = os.path.join(save_path, 'dodfs')
        self._create_single_folder(self._download_path)

    @classmethod
    def _string_to_date(cls, date):
        """Convert a month/year string to a date.

        Args:
            date (str): The date in format mm/yyyy or mm-yyyy.

        Returns:
            A :obj:`datetime.date` set to the first day of the given month.

        Raises:
            ValueError: The date has no '/' or '-' separator, or it cannot
                be parsed in the expected format.

        """
        if '/' in date:
            return datetime.strptime(date, '%m/%Y').date()
        if '-' in date:
            return datetime.strptime(date, '%m-%Y').date()
        # ValueError matches what strptime raises for malformed input and
        # is still caught by callers handling the generic Exception.
        msg = 'start_date or end_date must be in format mm/yyyy or mm-yyyy'
        raise ValueError(msg)

    def _create_single_folder(self, path):
        """Create a single folder given the directory path.

        This function might create a folder (together with any missing
        parent folders), observe that the folder already exists, or raise
        an error if the folder cannot be created.

        Args:
            path (str): The path to be created

        Raises:
            OSError: Error creating the directory.

        """
        if os.path.exists(path):
            self._log(os.path.basename(path) + " folder already exist")
            return

        try:
            # makedirs also builds missing parents; exist_ok tolerates the
            # folder appearing between the check above and this call.
            os.makedirs(path, exist_ok=True)
        except OSError as error:
            self._log("Exception during the directory creation")
            self._log(str(error))
            raise
        self._log(os.path.basename(path) + " directory successful created")

    def _create_download_folder(self):
        """Create Downloaded DODFs Structures."""
        self._create_single_folder(self._download_path)

    def _fail_request_message(self, url, error):
        """Log error messages in download.

        Args:
            url (str): The failing url to the website.
            error (str): The kind of error happening.

        """
        self._log(error)
        message = "Please check your internet connection, and " \
            f"check if the url is online via browser: {url}"
        self._log(message)

    def _file_exist(self, path):
        """Check if a file exists.

        Prevents redownloads.

        Args:
            path (str): The path where the file might be

        Returns:
            Boolean indicating if file does really exists.

        """
        if os.path.exists(path):
            self._log(os.path.basename(path) + " file already exist")
            return True
        return False

    def _download_pdf(self, url, path):
        """Download the DODF PDF.

        Request failures are logged and swallowed, so one broken link does
        not abort the whole pull.

        Note:
            Might be time consuming depending on bandwidth.

        Args:
            url (str): The pdf url.
            path (str): The path to save the pdf, without the ``.pdf``
                extension (it is appended here).

        """
        try:
            response = requests.get(url, timeout=self._REQUEST_TIMEOUT)
            response.raise_for_status()
        except requests.exceptions.RequestException as error:
            # HTTPError and timeouts are RequestException subclasses.
            self._fail_request_message(url, error)
        else:
            pdf_file = Path(f"{path}.pdf")
            pdf_file.write_bytes(response.content)
            self._log("Finished " + os.path.basename(path))

    def _make_month_path(self, year, actual_date):
        """Create and return the folder for the year and month being download.

        The year folder is only created when ``actual_date`` falls in a
        different year than ``year``; the month folder itself is not
        created here.

        Args:
            year (int): The year of the previously processed month.
            actual_date (:obj:`datetime`): The date in which the downloaded
                DODF corresponds.

        Returns:
            The path to the actual month in which the download is being made.

        """
        year_path = os.path.join(self._download_path, str(actual_date.year))
        if year != actual_date.year:
            self._create_single_folder(year_path)
        return os.path.join(year_path, MONTHS_STRING[actual_date.month])

    def pull(self, start_date, end_date):
        """Make the download of the DODFs pdfs.

        All dodfs are downloaded from start_date to end_date inclusively.
        The Pdfs are saved in the "dodfs" folder inside the save path given
        to the constructor.

        Args:
            start_date (str): The start date in format mm/yyyy.
            end_date (str): The end date in format mm/yyyy.

        Raises:
            ValueError: One of the dates is not in a supported format.

        """
        start_date = self._string_to_date(start_date)
        end_date = self._string_to_date(end_date)
        months_amt = ((end_date.year - start_date.year) * 12
                      + (end_date.month - start_date.month))
        # Both ends of the interval are downloaded, hence the "+ 1". An
        # end date before the start date yields no work at all.
        total_months = max(months_amt + 1, 0)

        # Release the previous bar so the new one is not drawn below it.
        self._prog_bar.close()
        self._prog_bar = tqdm.tqdm(total=total_months)

        self._create_download_folder()

        year = 0
        try:
            for month in range(total_months):
                actual_date = start_date + relativedelta(months=+month)
                self._prog_bar.set_description(f"Date {actual_date}")
                month_path = self._make_month_path(year, actual_date)
                year = actual_date.year
                year_ = str(year)
                month_ = MONTHS_STRING[actual_date.month]
                if check_date(year_, month_) is True:
                    self._create_single_folder(month_path)
                    self._get_dodfs(get_downloads(year_, month_), month_path)
                else:
                    print(
                        f"*** There are still no DODFs for that date: {actual_date.month}/{year_} ***")
                # Skipped months count as processed too, otherwise the bar
                # never reaches its total.
                self._prog_bar.update(1)
        finally:
            # Logging through a closed bar still works.
            self._prog_bar.close()

    def pull_json(self, JSON_URL):
        """Download the DODF JSON file available on the current day.

        The file is saved in the download folder of this instance. Request
        failures, including HTTP error statuses, are logged and swallowed.

        Args:
            JSON_URL (str): The url that serves the JSON file.

        Note:
            There is no way of downloading JSON files from past days because
            they are not provided.

        """
        try:
            response = requests.get(JSON_URL, timeout=self._REQUEST_TIMEOUT)
            # Turn 4xx/5xx answers into HTTPError so they are reported
            # below instead of being silently ignored.
            response.raise_for_status()
            if response.status_code == 200:
                # Creates and saves the JSON file
                json_data = response.json()
                json_title = json_data['lstJornalDia'][0][:-5] + '.json'
                json_path = os.path.join(self._download_path, json_title)
                with open(json_path, "w", encoding="utf-8") as file:
                    json.dump(json_data, file)
                print('\nThe JSON file has been downloaded successfully from '
                      + JSON_URL + '.')
        except requests.exceptions.RequestException as error:
            self._fail_request_message(JSON_URL, error)

    def _get_dodfs(self, _links_for_each_dodf, month_path):
        """Create folder and stores the DODFs pdfs.

        A DODF with a single link is saved directly in ``month_path``. A
        DODF with several links gets its own folder and numbered files.
        Files already on disk are not downloaded again.

        Args:
            _links_for_each_dodf (dict): a dicts with links for each DODF.
            month_path (str): path to store DODFs pdfs.

        """
        for dodf_name, links in _links_for_each_dodf.items():
            has_many_links = len(links) > 1
            dodf_path = month_path
            if has_many_links:
                dodf_path = os.path.join(month_path, dodf_name)
                self._create_single_folder(dodf_path)

            for index, download_link in enumerate(links, start=1):
                if has_many_links:
                    file_name = f'{dodf_name} {index}'
                else:
                    file_name = dodf_name
                dodf_name_path = os.path.join(dodf_path, file_name)

                # _download_pdf appends ".pdf" when saving, so the check
                # must look for that file name.
                if self._file_exist(f"{dodf_name_path}.pdf"):
                    self._log("Jumping to the next")
                else:
                    self._log("Downloding " + os.path.basename(dodf_name_path))
                    self._download_pdf(download_link, dodf_name_path)

    def get_download_path(self):
        """Return the folder in which the downloads are stored."""
        return self._download_path

    def _log(self, message):
        """Logs a message following the downloader pattern.

        Args:
            message (str): The message to be logged.

        """
        self._prog_bar.write("[DOWNLOADER] " + str(message))
if __name__ == '__main__':
    # Sample run: fetch every DODF published in May and June of 2021 into
    # ./dodfs.
    Downloader(save_path='./').pull(start_date="05/2021", end_date="06/2021")