diff --git a/.env b/.env new file mode 100644 index 0000000..c813588 --- /dev/null +++ b/.env @@ -0,0 +1 @@ +INPUT_FILE="OUROCARD_VISA_INFINITE-Ago_24.txt" \ No newline at end of file diff --git a/etl.py b/etl.py index 3f64216..c3df4dd 100644 --- a/etl.py +++ b/etl.py @@ -1,72 +1,138 @@ -insert_query ="INSERT INTO default.TRANSACTION (TDATE, ACCOUNTID, MEMO, CITY, COUNTRY, OUTFLOW, INFLOW, OWNERID, INSTALLMENT_NR, INSTALLMENT_TT, CREATED, UPDATED) VALUES ( %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s )" +import os +from dotenv import load_dotenv -def create_lists(): +load_dotenv() + +insert_query = "INSERT INTO default.TRANSACTION (TDATE, ACCOUNTID, MEMO, CITY, COUNTRY, OUTFLOW, INFLOW, OWNERID, INSTALLMENT_NR, INSTALLMENT_TT, CREATED, UPDATED) VALUES ( %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s )" +input_file = os.getenv('INPUT_FILE') + +def create_lists(input_file: str): import re - # Open the text file - with open('OUROCARD_VISA_INFINITE-Ago_24.txt', 'r', encoding='latin') as file: - # Read the contents of the file + with open(input_file, "r", encoding="latin") as file: contents = file.readlines() - # Define the regex patterns - dan_pattern = r'1 - DANIEL.*' - iza_pattern = r'4 - IZABELY.*' - line_pattern = r'\d{2}\.\d{2}\.\d{4}.{23}.{14}.{2}\s*\d+,\d{2}\s*\d+,\d{2}' + # REGEX + owner_pattern = r"\d\s*-\s*(\w+)" + line_pattern = r"\d{2}\.\d{2}\.\d{4}.{23}.{14}.{2}\s*-?\d*\.?\d+,\d{2}\s*\d+,\d{2}" + payment_pattern = (r"\d{2}\.\d{2}\.\d{4}PGTO.*200211(\s*-?\d*\.?\d+,\d{2})(\s*\d+,\d{2})") # Lists - list_dan = [] - list_iza = [] current_list = None + owner_list = [] + result = {} + + silly_counter = 1 - # Iterate all lines + # Find Owners for line in contents: line = line.strip() - if re.match(dan_pattern, line): - current_list = 'list_dan' - elif re.match(iza_pattern, line): - current_list = 'list_iza' + + found_owners = re.findall(owner_pattern, line) + if found_owners: + for owner_name in found_owners: + list_name = f"list_{owner_name.lower()}" + owner_list.append(list_name) + result[list_name] = {} + result[list_name]["owner_name"] = owner_name + result[list_name]["owner_id"] = silly_counter + silly_counter = silly_counter + 1 + + for line in contents: + line = line.strip() + + if re.match(owner_pattern, line): + found_owner = re.match(owner_pattern, line) + owner_list = f"list_{found_owner.group(1).lower()}" + current_list = owner_list + result[current_list]["tlist"] = [] else: - if re.match(line_pattern, line): - if current_list == 'list_dan': - list_dan.append(line) - if current_list == 'list_iza': - list_iza.append(line) - - return [list_dan, list_iza] - -def build_insert(lists: list[list, list], account: int, owner: int): + if re.match(payment_pattern, line): + result[current_list]["tlist"].append(line) + elif re.match(line_pattern, line): + result[current_list]["tlist"].append(line) + + return result + + +def build_insert(input_dict: dict, account: int): from datetime import date, datetime + import re insert_bulk = [] - line_group_pattern = r'(\d{2})\.(\d{2})\.(\d{4})((.+PARC (\d+)\/(\d+))(\s.{12})|(.{23})(.{14}))(.{2})(\s*\d+,\d{2})(\s*\d+,\d{2})' - - for batch in lists: - for item in batch: - match = re.search(line_group_pattern, item) - tTdate = str(date(int(match.group(3)), int(match.group(2)), int(match.group(1)))) - tAccount = account - - #* check for Installments - if match.group(5): - tMemo = match.group(5) - tCity = match.group(8) - tInstallmentNr = int(match.group(6)) - tInstallmentTt = int(match.group(7)) - else: - tMemo = match.group(9) - tCity = match.group(10) - tInstallmentNr = 1 + line_group_pattern = r"(\d{2})\.(\d{2})\.(\d{4})((.+PARC (\d+.)\/(\d+))(\s.{12})|(.{23})(.{14}))(.{2})(\s*-?\d*\.?\d+,\d{2})(\s*\d*\.?\d+,\d{2})" + payment_pattern = r"(\d{2})\.(\d{2})\.(\d{4})(PGTO DEBITO CONTA).*200211(\s*-?\d*\.?\d+,\d{2})(\s*\d+,\d{2})" + + for key in input_dict: + for item in input_dict[key]["tlist"]: + # * check for payment + matches = re.match(payment_pattern, item) + if matches: + tTdate = str( + date( + int(matches.group(3)), + int(matches.group(2)), + int(matches.group(1)), + ) + ) + tAccount = account + tMemo = matches.group(4) + tCity = None + tCountry = None + tOutflow = None + tInflow = matches.group(5).strip().replace(".", "").replace(",", ".") + tOwner = input_dict[key]["owner_id"] + tInstallmentNr = None tInstallmentTt = None + tCreated = str(datetime.now(tz=None)) + tUpdated = None + else: + matches = re.match(line_group_pattern, item) + tTdate = str( + date( + int(matches.group(3)), + int(matches.group(2)), + int(matches.group(1)), + ) + ) + tAccount = account + + # * check for Installments + if matches.group(5): + tMemo = matches.group(5) + tCity = matches.group(8) + tInstallmentNr = int(matches.group(6)) + tInstallmentTt = int(matches.group(7)) + else: + tMemo = matches.group(9) + tCity = matches.group(10) + tInstallmentNr = 1 + tInstallmentTt = None + + tCountry = matches.group(11) + tOutflow = matches.group(12).strip().replace(".", "").replace(",", ".") + tInflow = matches.group(13).strip().replace(".", "").replace(",", ".") + tOwner = input_dict[key]["owner_id"] - tCountry = match.group(11) - tOutflow = match.group(12).strip().replace(',', '.') - tInflow = match.group(13).strip().replace(',', '.') - tOwner = owner + tCreated = str(datetime.now(tz=None)) + tUpdated = None + insert_bulk.append( + ( + tTdate, + tAccount, + tMemo, + tCity, + tCountry, + tOutflow, + tInflow, + tOwner, + tInstallmentNr, + tInstallmentTt, + tCreated, + tUpdated, + ) + ) - tCreated = str(datetime.now(tz=None)) - tUpdated = None - insert_bulk.append(( tTdate, tAccount, tMemo, tCity, tCountry, tOutflow, tInflow, tOwner, tInstallmentNr, tInstallmentTt, tCreated, tUpdated )) - return insert_bulk def db_insert(insert_bulk: list[tuple]): @@ -74,10 +140,10 @@ def db_insert(insert_bulk: list[tuple]): try: with connect( - host='localhost', - user='root', - password='pleasehashapasswordomg', - database='default' + host="localhost", + user="root", + password="pleasehashapasswordomg", + database="default", ) as connection: print("CONNECTED!", connection) with connection.cursor() as cursor: @@ -89,4 +155,4 @@ def db_insert(insert_bulk: list[tuple]): finally: connection.close() -db_insert(build_insert(create_lists(), 1, 1)) \ No newline at end of file +db_insert(build_insert(create_lists(), 1)) \ No newline at end of file diff --git a/pdfScrape.ipynb b/pdfScrape.ipynb index dbf46d4..5e9d089 100644 --- a/pdfScrape.ipynb +++ b/pdfScrape.ipynb @@ -50,7 +50,7 @@ "import re\n", "\n", "# Open the text file\n", - "with open('OUROCARD_VISA_INFINITE-Ago_24.txt', 'r') as file=\n", + "with open('OUROCARD_VISA_INFINITE-Ago_24.txt', 'r') as file:\n", " # Read the contents of the file\n", " contents = file.read()\n", "\n", @@ -234,7 +234,7 @@ " contents = file.readlines()\n", "\n", " # Define the regex patterns\n", - " owner_pattern = r\"\\d\\s-\\s(\\w+)\"\n", + " owner_pattern = r\"\\d\\s*-\\s*(\\w+)\"\n", " line_pattern = r\"\\d{2}\\.\\d{2}\\.\\d{4}.{23}.{14}.{2}\\s*-?\\d*\\.?\\d+,\\d{2}\\s*\\d+,\\d{2}\"\n", " payment_pattern = (r\"\\d{2}\\.\\d{2}\\.\\d{4}PGTO.*200211(\\s*-?\\d*\\.?\\d+,\\d{2})(\\s*\\d+,\\d{2})\")\n", "\n",