Browse Source

updates

master
F2256342 - Daniel de Oliveira Carvalho 1 year ago
parent
commit
279face500
  1. 1
      .env
  2. 178
      etl.py
  3. 4
      pdfScrape.ipynb

1
.env

@ -0,0 +1 @@
INPUT_FILE="OUROCARD_VISA_INFINITE-Ago_24.txt"

178
etl.py

@ -1,72 +1,138 @@
insert_query ="INSERT INTO default.TRANSACTION (TDATE, ACCOUNTID, MEMO, CITY, COUNTRY, OUTFLOW, INFLOW, OWNERID, INSTALLMENT_NR, INSTALLMENT_TT, CREATED, UPDATED) VALUES ( %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s )" import os
from dotenv import load_dotenv
def create_lists(): load_dotenv()
insert_query = "INSERT INTO default.TRANSACTION (TDATE, ACCOUNTID, MEMO, CITY, COUNTRY, OUTFLOW, INFLOW, OWNERID, INSTALLMENT_NR, INSTALLMENT_TT, CREATED, UPDATED) VALUES ( %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s )"
input_file = os.getenv('INPUT_FILE')
def create_lists(input_file: "str | None" = None):
    """Group the transaction lines of a statement text export by card owner.

    The file is read with the ``latin`` codec, one line at a time. A line
    that *starts* with ``<digit> - <NAME>`` is an owner header and makes that
    owner current; every following line that looks like a payment or a
    regular transaction is stored, stripped, under the current owner.

    Args:
        input_file: Path of the statement export. When omitted, the
            ``INPUT_FILE`` environment variable is used instead, so the
            argument-less call made by the script keeps working.

    Returns:
        A dict keyed by ``"list_<name lowercased>"``. Each value is a dict
        with ``"owner_name"`` (as written in the header), ``"owner_id"``
        (1, 2, ... in order of first appearance) and ``"tlist"`` (the raw
        matching lines, in file order). Every owner always has a ``"tlist"``.

    Raises:
        ValueError: If no path is given and ``INPUT_FILE`` is not set.
        OSError: If the file cannot be opened.
    """
    import os
    import re

    if input_file is None:
        input_file = os.getenv("INPUT_FILE")
    if not input_file:
        raise ValueError(
            "no statement file: pass input_file or set the INPUT_FILE environment variable"
        )

    # Owner headers are only recognised at the start of a line (re.match).
    # Searching anywhere in the line would treat fragments such as
    # "200211   -500,00" or "7-ELEVEN" inside a transaction as new owners.
    owner_re = re.compile(r"\d\s*-\s*(\w+)")
    # Date, 23-char memo, 14-char city, 2-char country, outflow, inflow.
    # The inflow accepts an optional thousands separator, like the outflow
    # and like the pattern build_insert() uses to parse these same lines.
    line_re = re.compile(
        r"\d{2}\.\d{2}\.\d{4}.{23}.{14}.{2}\s*-?\d*\.?\d+,\d{2}\s*\d*\.?\d+,\d{2}"
    )
    payment_re = re.compile(
        r"\d{2}\.\d{2}\.\d{4}PGTO.*200211(\s*-?\d*\.?\d+,\d{2})(\s*\d+,\d{2})"
    )

    result = {}
    current_list = None

    with open(input_file, "r", encoding="latin") as file:
        for raw_line in file:
            line = raw_line.strip()

            header = owner_re.match(line)
            if header:
                owner_name = header.group(1)
                current_list = f"list_{owner_name.lower()}"
                # A header that shows up again (e.g. on a later page) must
                # neither reset the collected lines nor change the owner id.
                if current_list not in result:
                    result[current_list] = {
                        "owner_name": owner_name,
                        "owner_id": len(result) + 1,
                        "tlist": [],
                    }
                continue

            if current_list is None:
                # Nothing can be attributed before the first owner header.
                continue

            if payment_re.match(line) or line_re.match(line):
                result[current_list]["tlist"].append(line)

    return result
def build_insert(input_dict: dict, account: int):
    """Convert grouped statement lines into parameter tuples for a bulk INSERT.

    Each tuple follows the column order of the module's ``insert_query``:
    (TDATE, ACCOUNTID, MEMO, CITY, COUNTRY, OUTFLOW, INFLOW, OWNERID,
    INSTALLMENT_NR, INSTALLMENT_TT, CREATED, UPDATED).

    Args:
        input_dict: Mapping of owner key to a dict holding ``"owner_id"`` and
            ``"tlist"`` (raw statement lines). An owner without ``"tlist"``
            contributes no rows.
        account: Value stored in the ACCOUNTID column of every row.

    Returns:
        A list of 12-item tuples, one per statement line, in input order.

    Raises:
        ValueError: If a line matches neither the payment nor the
            transaction layout, so a malformed line is reported by content
            instead of failing on ``None.group``.
    """
    from datetime import date, datetime
    import re

    # Groups: 1-3 day/month/year; 5-8 installment memo, number, total and
    # city; 9-10 plain memo and city; 11 country; 12 outflow; 13 inflow.
    line_group_re = re.compile(
        r"(\d{2})\.(\d{2})\.(\d{4})((.+PARC (\d+.)\/(\d+))(\s.{12})|(.{23})(.{14}))(.{2})(\s*-?\d*\.?\d+,\d{2})(\s*\d*\.?\d+,\d{2})"
    )
    # Groups: 1-3 day/month/year; 4 memo; 5 and 6 the two trailing amounts.
    payment_re = re.compile(
        r"(\d{2})\.(\d{2})\.(\d{4})(PGTO DEBITO CONTA).*200211(\s*-?\d*\.?\d+,\d{2})(\s*\d+,\d{2})"
    )

    def iso_date(match):
        """Return the day/month/year in groups 1-3 as a ``YYYY-MM-DD`` string."""
        day, month, year = (int(match.group(index)) for index in (1, 2, 3))
        return str(date(year, month, day))

    def decimal_text(raw):
        """Rewrite an amount such as ' 1.234,56' as '1234.56'."""
        return raw.strip().replace(".", "").replace(",", ".")

    insert_bulk = []
    for owner in input_dict.values():
        for item in owner.get("tlist", ()):
            payment = payment_re.match(item)
            if payment:
                transaction_date = iso_date(payment)
                memo = payment.group(4)
                city = None
                country = None
                outflow = None
                # The first captured amount is stored as INFLOW with its sign
                # kept as printed. NOTE(review): confirm the sign convention.
                inflow = decimal_text(payment.group(5))
                installment_nr = None
                installment_tt = None
            else:
                matches = line_group_re.match(item)
                if matches is None:
                    raise ValueError(f"unrecognised statement line: {item!r}")
                transaction_date = iso_date(matches)
                if matches.group(5):
                    # Installment purchase ("... PARC nn/tt").
                    memo = matches.group(5)
                    city = matches.group(8)
                    installment_nr = int(matches.group(6))
                    installment_tt = int(matches.group(7))
                else:
                    memo = matches.group(9)
                    city = matches.group(10)
                    installment_nr = 1
                    installment_tt = None
                country = matches.group(11)
                outflow = decimal_text(matches.group(12))
                inflow = decimal_text(matches.group(13))

            insert_bulk.append(
                (
                    transaction_date,
                    account,
                    memo,
                    city,
                    country,
                    outflow,
                    inflow,
                    owner["owner_id"],
                    installment_nr,
                    installment_tt,
                    str(datetime.now(tz=None)),  # CREATED, naive local time
                    None,  # UPDATED
                )
            )
    return insert_bulk
def db_insert(insert_bulk: list[tuple]): def db_insert(insert_bulk: list[tuple]):
@ -74,10 +140,10 @@ def db_insert(insert_bulk: list[tuple]):
try: try:
with connect( with connect(
host='localhost', host="localhost",
user='root', user="root",
password='pleasehashapasswordomg', password="pleasehashapasswordomg",
database='default' database="default",
) as connection: ) as connection:
print("CONNECTED!", connection) print("CONNECTED!", connection)
with connection.cursor() as cursor: with connection.cursor() as cursor:
@ -89,4 +155,4 @@ def db_insert(insert_bulk: list[tuple]):
finally: finally:
connection.close() connection.close()
# Parse the statement named by the module-level ``input_file`` (read from the
# INPUT_FILE environment variable), build the rows for account 1 and insert
# them. create_lists() takes the path as an argument, so it is passed here.
db_insert(build_insert(create_lists(input_file), 1))

4
pdfScrape.ipynb

@ -50,7 +50,7 @@
"import re\n", "import re\n",
"\n", "\n",
"# Open the text file\n", "# Open the text file\n",
"with open('OUROCARD_VISA_INFINITE-Ago_24.txt', 'r') as file=\n", "with open('OUROCARD_VISA_INFINITE-Ago_24.txt', 'r') as file:\n",
" # Read the contents of the file\n", " # Read the contents of the file\n",
" contents = file.read()\n", " contents = file.read()\n",
"\n", "\n",
@ -234,7 +234,7 @@
" contents = file.readlines()\n", " contents = file.readlines()\n",
"\n", "\n",
" # Define the regex patterns\n", " # Define the regex patterns\n",
" owner_pattern = r\"\\d\\s-\\s(\\w+)\"\n", " owner_pattern = r\"\\d\\s*-\\s*(\\w+)\"\n",
" line_pattern = r\"\\d{2}\\.\\d{2}\\.\\d{4}.{23}.{14}.{2}\\s*-?\\d*\\.?\\d+,\\d{2}\\s*\\d+,\\d{2}\"\n", " line_pattern = r\"\\d{2}\\.\\d{2}\\.\\d{4}.{23}.{14}.{2}\\s*-?\\d*\\.?\\d+,\\d{2}\\s*\\d+,\\d{2}\"\n",
" payment_pattern = (r\"\\d{2}\\.\\d{2}\\.\\d{4}PGTO.*200211(\\s*-?\\d*\\.?\\d+,\\d{2})(\\s*\\d+,\\d{2})\")\n", " payment_pattern = (r\"\\d{2}\\.\\d{2}\\.\\d{4}PGTO.*200211(\\s*-?\\d*\\.?\\d+,\\d{2})(\\s*\\d+,\\d{2})\")\n",
"\n", "\n",

Loading…
Cancel
Save