Browse Source

updates

master
Yutsuo 1 year ago
parent
commit
5cf373f531
  1. 117
      etl.py

117
etl.py

@ -4,18 +4,27 @@ from dotenv import load_dotenv
load_dotenv() load_dotenv()
insert_query = "INSERT INTO default.TRANSACTION (TDATE, ACCOUNTID, MEMO, CITY, COUNTRY, OUTFLOW, INFLOW, OWNERID, INSTALLMENT_NR, INSTALLMENT_TT, CREATED, UPDATED) VALUES ( %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s )" insert_query = "INSERT INTO default.TRANSACTION (TDATE, ACCOUNTID, MEMO, CITY, COUNTRY, OUTFLOW, INFLOW, OWNERID, INSTALLMENT_NR, INSTALLMENT_TT, CREATED, UPDATED) VALUES ( %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s )"
input_file = os.getenv('INPUT_FILE') input_file = os.getenv("INPUT_FILE")
def create_lists(input_file: str):
def create_lists():
import re import re
with open(input_file, "r", encoding="latin") as file: # Open the text file
# with open("OUROCARD_VISA_INFINITE-Próxima_Fatura.txt", "r", encoding="latin") as file:
with open("OUROCARD_VISA_INFINITE-Ago_24.txt", "r", encoding="latin") as file:
# Read the contents of the file
contents = file.readlines() contents = file.readlines()
# REGEX # Define the regex patterns
owner_pattern = r"\d\s*-\s*(\w+)" owner_pattern = r"\d\s?-\s?([A-Z]+)"
line_pattern = r"\d{2}\.\d{2}\.\d{4}.{23}.{14}.{2}\s*-?\d*\.?\d+,\d{2}\s*\d+,\d{2}" line_pattern = r"\d{2}\.\d{2}\.\d{4}.{23}.{14}.{2}\s*-?\d*\.?\d+,\d{2}\s*\d+,\d{2}"
payment_pattern = (r"\d{2}\.\d{2}\.\d{4}PGTO.*200211(\s*-?\d*\.?\d+,\d{2})(\s*\d+,\d{2})") payment_pattern = (
r"\d{2}\.\d{2}\.\d{4}PGTO.*200211(\s*-?\d*\.?\d+,\d{2})(\s*\d+,\d{2})"
)
partial_invoice_line_pattern = (
r"\d{2}\/\d{2}.{27}.{16}.{2}\s+\s*-?\d*\.?\d+,\d{2}\s*\d+,\d{2}"
)
# Lists # Lists
current_list = None current_list = None
@ -23,6 +32,7 @@ def create_lists(input_file: str):
result = {} result = {}
silly_counter = 1 silly_counter = 1
isPartial = True
# Find Owners # Find Owners
for line in contents: for line in contents:
@ -38,6 +48,7 @@ def create_lists(input_file: str):
result[list_name]["owner_id"] = silly_counter result[list_name]["owner_id"] = silly_counter
silly_counter = silly_counter + 1 silly_counter = silly_counter + 1
# Treat and create transaction lists
for line in contents: for line in contents:
line = line.strip() line = line.strip()
@ -49,9 +60,19 @@ def create_lists(input_file: str):
else: else:
if re.match(payment_pattern, line): if re.match(payment_pattern, line):
result[current_list]["tlist"].append(line) result[current_list]["tlist"].append(line)
elif re.match(line_pattern, line): elif re.match(line_pattern, line) or re.match(
partial_invoice_line_pattern, line
):
result[current_list]["tlist"].append(line) result[current_list]["tlist"].append(line)
# Check file pattern
sample = result[current_list]["tlist"][0]
if re.match(line_pattern, sample):
isPartial = False
for listObj in result:
result[listObj]["isPartial"] = isPartial
return result return result
@ -60,58 +81,81 @@ def build_insert(input_dict: dict, account: int):
import re import re
insert_bulk = [] insert_bulk = []
line_group_pattern = r"(\d{2})\.(\d{2})\.(\d{4})((.+PARC (\d+.)\/(\d+))(\s.{12})|(.{23})(.{14}))(.{2})(\s*-?\d*\.?\d+,\d{2})(\s*\d*\.?\d+,\d{2})"
payment_pattern = r"(\d{2})\.(\d{2})\.(\d{4})(PGTO DEBITO CONTA).*200211(\s*-?\d*\.?\d+,\d{2})(\s*\d+,\d{2})" # RegEx Patterns
line_group_pattern = r"(?P<day>\d{2})\.(?P<month>\d{2})\.(?P<year>\d{4})(?:(?P<p_memo>.+PARC (?P<p_nr>\d+.)\/(?P<p_tt>\d+)\s.{12})|(?P<memo>.{37}))(?P<country>.{2})(?P<outflow>\s*-?\d*\.?\d+,\d{2})(?P<inflow>\s*\d*\.?\d+,\d{2})"
partial_invoice_group_pattern = r"(?P<day>\d{2})\/(?P<month>\d{2})(?:(?P<p_memo>.+PARC (?P<p_nr>\d{2})\/(?P<p_tt>\d{2}).{15})|(?P<memo>.{43}))(?P<country>.{2})(?P<outflow>\s+\s*-?\d*\.?\d+,\d{2})(?P<inflow>\s*\d+,\d{2})"
payment_pattern = r"(?P<day>\d{2})\.(?P<month>\d{2})\.(?P<year>\d{4})(?P<memo>PGTO DEBITO CONTA).*200211(?P<inflow>\s*-?\d*\.?\d+,\d{2})(?P<outflow>\s*\d+,\d{2})"
for key in input_dict: for key in input_dict:
if input_dict[key]["isPartial"]:
pattern_to_use = partial_invoice_group_pattern
else:
pattern_to_use = line_group_pattern
for item in input_dict[key]["tlist"]: for item in input_dict[key]["tlist"]:
# * check for payment # check for payment
matches = re.match(payment_pattern, item) matches = re.match(payment_pattern, item)
if matches: if matches:
tTdate = str( tTdate = str(
date( date(
int(matches.group(3)), int(matches.group("year")),
int(matches.group(2)), int(matches.group("month")),
int(matches.group(1)), int(matches.group("day")),
) )
) )
tAccount = account tAccount = account
tMemo = matches.group(4) tMemo = matches.group("memo")
tCity = None
tCountry = None tCountry = None
tOutflow = None tOutflow = None
tInflow = matches.group(5).strip().replace(".", "").replace(",", ".") tInflow = (
matches.group("inflow")
.strip()
.replace(".", "")
.replace(",", ".")
.replace("-", "")
)
tOwner = input_dict[key]["owner_id"] tOwner = input_dict[key]["owner_id"]
tInstallmentNr = None tInstallmentNr = None
tInstallmentTt = None tInstallmentTt = None
tCreated = str(datetime.now(tz=None)) tCreated = str(datetime.now(tz=None))
tUpdated = None tUpdated = None
else: else:
matches = re.match(line_group_pattern, item) matches = re.match(pattern_to_use, item)
tTdate = str( tTdate = str(
date( date(
int(matches.group(3)), # partial files will not have the year data on transactions
int(matches.group(2)), (
int(matches.group(1)), int(matches.group("year"))
if pattern_to_use == line_group_pattern
else datetime.now().year
),
int(matches.group("month")),
int(matches.group("day")),
) )
) )
tAccount = account tAccount = account
# * check for Installments tMemo = (
if matches.group(5): matches.group("p_memo")
tMemo = matches.group(5) if matches.group("p_memo")
tCity = matches.group(8) else matches.group("memo")
tInstallmentNr = int(matches.group(6)) )
tInstallmentTt = int(matches.group(7)) tInstallmentNr = (
else: int(matches.group("p_nr")) if matches.group("p_nr") else None
tMemo = matches.group(9) )
tCity = matches.group(10) tInstallmentTt = (
tInstallmentNr = 1 int(matches.group("p_tt")) if matches.group("p_tt") else None
tInstallmentTt = None )
tCountry = matches.group(11) tCountry = matches.group("country")
tOutflow = matches.group(12).strip().replace(".", "").replace(",", ".") tOutflow = (
tInflow = matches.group(13).strip().replace(".", "").replace(",", ".") matches.group("outflow").strip().replace(".", "").replace(",", ".")
)
tInflow = (
matches.group("inflow").strip().replace(".", "").replace(",", ".")
)
tOwner = input_dict[key]["owner_id"] tOwner = input_dict[key]["owner_id"]
tCreated = str(datetime.now(tz=None)) tCreated = str(datetime.now(tz=None))
@ -121,7 +165,6 @@ def build_insert(input_dict: dict, account: int):
tTdate, tTdate,
tAccount, tAccount,
tMemo, tMemo,
tCity,
tCountry, tCountry,
tOutflow, tOutflow,
tInflow, tInflow,
@ -135,6 +178,7 @@ def build_insert(input_dict: dict, account: int):
return insert_bulk return insert_bulk
def db_insert(insert_bulk: list[tuple]): def db_insert(insert_bulk: list[tuple]):
from mysql.connector import connect, Error from mysql.connector import connect, Error
@ -155,4 +199,5 @@ def db_insert(insert_bulk: list[tuple]):
finally: finally:
connection.close() connection.close()
db_insert(build_insert(create_lists(), 1)) db_insert(build_insert(create_lists(), 1))
Loading…
Cancel
Save