Browse Source

updates

master
Yutsuo 1 year ago
parent
commit
0f30fdb9f4
  1. 2181
      lab.ipynb
  2. 499
      pdfScrape.ipynb
  3. 11
      pdfquery.py
  4. 21
      robopato.sql

2181
lab.ipynb

File diff suppressed because one or more lines are too long

499
pdfScrape.ipynb

@ -1,499 +0,0 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"test_file = 'OUROCARD_VISA_INFINITE-Ago_24.txt'"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"with open('OUROCARD_VISA_INFINITE-Ago_24.txt', 'r') as reader:\n",
" data = reader.read()\n",
" print(data)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import re\n",
"\n",
"# Open the text file\n",
"with open('OUROCARD_VISA_INFINITE-Ago_24.txt', 'r') as file:\n",
" # Read the contents of the file\n",
" contents = file.read()\n",
"\n",
"# Define the regex pattern to match\n",
"pattern = r'\\d{2}\\.\\d{2}\\.\\d{4}.{23}.{14}.{2}\\s*\\d+,\\d{2}\\s*\\d+,\\d{2}'\n",
"\n",
"# Iterate over the lines that match the pattern\n",
"for matches in re.finditer(pattern, contents):\n",
" print(matches.group())"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import re\n",
"\n",
"# Open the text file\n",
"with open('OUROCARD_VISA_INFINITE-Ago_24.txt', 'r') as file:\n",
" # Read the contents of the file\n",
" contents = file.read()\n",
"\n",
"# Define the regex pattern to match\n",
"pattern = r'.*DANIEL.*|.*IZABELY.*'\n",
"\n",
"# Iterate over the lines that match the pattern\n",
"for matches in re.finditer(pattern, contents):\n",
" print(matches.group())"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import re\n",
"\n",
"# Open the text file\n",
"with open('OUROCARD_VISA_INFINITE-Ago_24.txt', 'r') as file:\n",
" # Read the contents of the file\n",
" contents = file.read()\n",
"\n",
"# Define the regex patterns\n",
"dan_pattern = r'.*DANIEL.*'\n",
"iza_pattern = r'.*IZABELY.*'\n",
"line_pattern = r'\\d{2}\\.\\d{2}\\.\\d{4}.{23}.{14}.{2}\\s*\\d+,\\d{2}\\s*\\d+,\\d{2}'\n",
"\n",
"# Iterate over the lines that match the pattern\n",
"for matches in re.finditer(line_pattern, contents):\n",
" print(matches.group())\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Open the text file\n",
"with open('table-test.txt', 'r') as file:\n",
" # Read the contents of the file\n",
" contents = file.readlines()\n",
"\n",
"# Initialize lists to store the lines under each table\n",
"table_a_lines = []\n",
"table_b_lines = []\n",
"\n",
"# Flag to determine which table section we are in\n",
"current_table = None\n",
"\n",
"# Iterate over the lines in the file\n",
"for line in contents:\n",
" line = line.strip() # Remove leading and trailing whitespace\n",
"\n",
" # Check for TABLEA and TABLEB\n",
" if line == 'TABLEA':\n",
" current_table = 'TABLEA'\n",
" elif line == 'TABLEB':\n",
" current_table = 'TABLEB'\n",
" else:\n",
" # Add lines to the appropriate list based on the current table\n",
" if current_table == 'TABLEA':\n",
" table_a_lines.append(line)\n",
" elif current_table == 'TABLEB':\n",
" table_b_lines.append(line)\n",
"\n",
"# Print the results\n",
"print('Lines under TABLEA:')\n",
"for data in table_a_lines:\n",
" print(data)\n",
"\n",
"print('\\nLines under TABLEB:')\n",
"for data in table_b_lines:\n",
" print(data)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import re\n",
"from datetime import date, datetime\n",
"import locale\n",
"\n",
"locale.setlocale(locale.LC_ALL, 'pt_BR.UTF-8')\n",
"\n",
"# Open the text file\n",
"with open('OUROCARD_VISA_INFINITE-Ago_24.txt', 'r', encoding='latin') as file:\n",
" # Read the contents of the file\n",
" contents = file.readlines()\n",
"\n",
"# Define the regex patterns\n",
"dan_pattern = r'1 - DANIEL.*'\n",
"iza_pattern = r'4 - IZABELY.*'\n",
"line_pattern = r'\\d{2}\\.\\d{2}\\.\\d{4}.{23}.{14}.{2}\\s*\\d+,\\d{2}\\s*\\d+,\\d{2}'\n",
"line_group_pattern = r'(\\d{2})\\.(\\d{2})\\.(\\d{4})(.{23})(.{14})(.{2})(\\s*\\d+,\\d{2})(\\s*\\d+,\\d{2})'\n",
"\n",
"# Lists\n",
"list_dan = []\n",
"list_iza = []\n",
"current_list = None\n",
"\n",
"insert_bulk = []\n",
"\n",
"# Iterate all lines\n",
"for line in contents:\n",
" line = line.strip()\n",
" if re.match(dan_pattern, line):\n",
" current_list = 'list_dan'\n",
" print('found Dan')\n",
" elif re.match(iza_pattern, line):\n",
" current_list = 'list_iza'\n",
" print('found Iza')\n",
" else:\n",
" if re.match(line_pattern, line):\n",
" if current_list == 'list_dan':\n",
" print(\"dan\", line)\n",
" list_dan.append(line)\n",
" if current_list == 'list_iza':\n",
" print(\"iza\", line)\n",
" list_iza.append(line)\n",
"\n",
"print('list_dan - tuples for insert')\n",
"for item in list_dan:\n",
" matches = re.search(line_group_pattern, item)\n",
" tTdate = str(date(int(matches.group(3)), int(matches.group(2)), int(matches.group(1))))\n",
" tAccount = 1\n",
" tMemo = matches.group(4)\n",
" tCity = matches.group(5)\n",
" tCountry = matches.group(6)\n",
" tOutflow = matches.group(7).strip().replace(',', '.')\n",
" tInflow = matches.group(8).strip().replace(',', '.')\n",
" tOwner = 1\n",
" tInstallments = 1\n",
" tCreated = str(datetime.now(tz=None))\n",
" tUpdated = None\n",
" insert_bulk.append(( tTdate, tAccount, tMemo, tCity, tCountry, tOutflow, tInflow, tOwner, tInstallments, tCreated, tUpdated ))\n",
"\n",
"print('list_iza - tuples for insert')\n",
"for item in list_iza:\n",
" matches = re.search(line_group_pattern, item)\n",
" tTdate = str(date(int(matches.group(3)), int(matches.group(2)), int(matches.group(1))))\n",
" tAccount = 1\n",
" tMemo = matches.group(4)\n",
" tCity = matches.group(5)\n",
" tCountry = matches.group(6)\n",
" tOutflow = matches.group(7).strip().replace(',', '.')\n",
" tInflow = matches.group(8).strip().replace(',', '.')\n",
" tOwner = 2\n",
" tInstallments = 1\n",
" tCreated = str(datetime.now(tz=None))\n",
" tUpdated = None\n",
" insert_bulk.append(( tTdate, tAccount, tMemo, tCity, tCountry, tOutflow, tInflow, tOwner, tInstallments, tCreated, tUpdated ))\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"insert_query = \"INSERT INTO default.TRANSACTION (TDATE, ACCOUNTID, MEMO, CITY, COUNTRY, OUTFLOW, INFLOW, OWNERID, INSTALLMENT_NR, INSTALLMENT_TT, CREATED, UPDATED) VALUES ( %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s )\""
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def create_lists():\n",
" import re\n",
"\n",
" # Open the text file\n",
" with open(\"OUROCARD_VISA_INFINITE-Ago_24.txt\", \"r\", encoding=\"latin\") as file:\n",
" # Read the contents of the file\n",
" contents = file.readlines()\n",
"\n",
" # Define the regex patterns\n",
" owner_pattern = r\"\\d\\s*-\\s*(\\w+)\"\n",
" line_pattern = r\"\\d{2}\\.\\d{2}\\.\\d{4}.{23}.{14}.{2}\\s*-?\\d*\\.?\\d+,\\d{2}\\s*\\d+,\\d{2}\"\n",
" payment_pattern = (r\"\\d{2}\\.\\d{2}\\.\\d{4}PGTO.*200211(\\s*-?\\d*\\.?\\d+,\\d{2})(\\s*\\d+,\\d{2})\")\n",
"\n",
" # Lists\n",
" current_list = None\n",
" owner_list = []\n",
" result = {}\n",
"\n",
" silly_counter = 1\n",
"\n",
" # Find Owners\n",
" for line in contents:\n",
" line = line.strip()\n",
"\n",
" found_owners = re.findall(owner_pattern, line)\n",
" if found_owners:\n",
" for owner_name in found_owners:\n",
" list_name = f\"list_{owner_name.lower()}\"\n",
" owner_list.append(list_name)\n",
" result[list_name] = {}\n",
" result[list_name][\"owner_name\"] = owner_name\n",
" result[list_name][\"owner_id\"] = silly_counter\n",
" silly_counter = silly_counter + 1\n",
"\n",
" for line in contents:\n",
" line = line.strip()\n",
"\n",
" if re.match(owner_pattern, line):\n",
" found_owner = re.match(owner_pattern, line)\n",
" owner_list = f\"list_{found_owner.group(1).lower()}\"\n",
" current_list = owner_list\n",
" result[current_list][\"tlist\"] = []\n",
" else:\n",
" if re.match(payment_pattern, line):\n",
" result[current_list][\"tlist\"].append(line)\n",
" elif re.match(line_pattern, line):\n",
" result[current_list][\"tlist\"].append(line)\n",
"\n",
" return result"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"print(create_lists())"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def build_insert(input_dict: dict, account: int):\n",
" from datetime import date, datetime\n",
" import re\n",
"\n",
" insert_bulk = []\n",
" line_group_pattern = r\"(\\d{2})\\.(\\d{2})\\.(\\d{4})((.+PARC (\\d+.)\\/(\\d+))(\\s.{12})|(.{23})(.{14}))(.{2})(\\s*-?\\d*\\.?\\d+,\\d{2})(\\s*\\d*\\.?\\d+,\\d{2})\"\n",
" payment_pattern = r\"(\\d{2})\\.(\\d{2})\\.(\\d{4})(PGTO DEBITO CONTA).*200211(\\s*-?\\d*\\.?\\d+,\\d{2})(\\s*\\d+,\\d{2})\"\n",
"\n",
" for key in input_dict:\n",
" for item in input_dict[key][\"tlist\"]:\n",
" # * check for payment\n",
" matches = re.match(payment_pattern, item)\n",
" if matches:\n",
" tTdate = str(\n",
" date(\n",
" int(matches.group(3)),\n",
" int(matches.group(2)),\n",
" int(matches.group(1)),\n",
" )\n",
" )\n",
" tAccount = account\n",
" tMemo = matches.group(4)\n",
" tCity = None\n",
" tCountry = None\n",
" tOutflow = None\n",
" tInflow = matches.group(5).strip().replace(\".\", \"\").replace(\",\", \".\")\n",
" tOwner = input_dict[key][\"owner_id\"]\n",
" tInstallmentNr = None\n",
" tInstallmentTt = None\n",
" tCreated = str(datetime.now(tz=None))\n",
" tUpdated = None\n",
" else:\n",
" matches = re.match(line_group_pattern, item)\n",
" tTdate = str(\n",
" date(\n",
" int(matches.group(3)),\n",
" int(matches.group(2)),\n",
" int(matches.group(1)),\n",
" )\n",
" )\n",
" tAccount = account\n",
"\n",
" # * check for Installments\n",
" if matches.group(5):\n",
" tMemo = matches.group(5)\n",
" tCity = matches.group(8)\n",
" tInstallmentNr = int(matches.group(6))\n",
" tInstallmentTt = int(matches.group(7))\n",
" else:\n",
" tMemo = matches.group(9)\n",
" tCity = matches.group(10)\n",
" tInstallmentNr = 1\n",
" tInstallmentTt = None\n",
"\n",
" tCountry = matches.group(11)\n",
" tOutflow = matches.group(12).strip().replace(\".\", \"\").replace(\",\", \".\")\n",
" tInflow = matches.group(13).strip().replace(\".\", \"\").replace(\",\", \".\")\n",
" tOwner = input_dict[key][\"owner_id\"]\n",
"\n",
" tCreated = str(datetime.now(tz=None))\n",
" tUpdated = None\n",
" insert_bulk.append(\n",
" (\n",
" tTdate,\n",
" tAccount,\n",
" tMemo,\n",
" tCity,\n",
" tCountry,\n",
" tOutflow,\n",
" tInflow,\n",
" tOwner,\n",
" tInstallmentNr,\n",
" tInstallmentTt,\n",
" tCreated,\n",
" tUpdated,\n",
" )\n",
" )\n",
"\n",
" return insert_bulk"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def db_insert(insert_bulk: list[tuple]):\n",
" from mysql.connector import connect, Error\n",
"\n",
" try:\n",
" with connect(\n",
" host=\"localhost\",\n",
" user=\"root\",\n",
" password=\"pleasehashapasswordomg\",\n",
" database=\"default\",\n",
" ) as connection:\n",
" print(\"CONNECTED!\", connection)\n",
" with connection.cursor() as cursor:\n",
" cursor.executemany(insert_query, insert_bulk)\n",
" connection.commit()\n",
" print(\"DONE!\")\n",
" except Error as e:\n",
" print(e)\n",
" finally:\n",
" connection.close()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"db_insert(build_insert(create_lists(), 1))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"build_insert(create_lists(), 1)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"create_lists()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"dictTest = {\n",
" \"owner1\": {\n",
" \"owner_label\": \"foo\",\n",
" \"owner_id\": 1,\n",
" \"list1\": [\"thingies, thingies, 42\"],\n",
" },\n",
" \"owner2\": {\n",
" \"owner_label\": \"bar\",\n",
" \"owner_id\": 2,\n",
" \"list1\": [\"thingies, thingies, 42\"],\n",
" },\n",
"}\n",
"\n",
"for owner in dictTest:\n",
" print(dictTest[owner][\"owner_id\"], dictTest[owner][\"owner_label\"])\n",
" for item in dictTest[owner][\"list1\"]:\n",
" print(item)\n",
"\n",
"dictTest[\"owner1\"][\"owner_label\"] = \"yadda\"\n",
"\n",
"for owner in dictTest:\n",
" print(dictTest[owner][\"owner_id\"], dictTest[owner][\"owner_label\"])\n",
" for item in dictTest[owner][\"list1\"]:\n",
" print(item)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"param1 = \"foo\"\n",
"param2 = \"bar\"\n",
"testy = {}\n",
"testy[param1] = {}\n",
"testy[param1][param2] = [\"what\", \"when\", \"why\"]\n",
"testy[param1][\"number\"] = 1\n",
"\n",
"print(testy)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.3"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

11
pdfquery.py

@ -1,11 +0,0 @@
"""Convert the PDF statement ``cc.pdf`` to pretty-printed XML (``cc.xml``).

NOTE(review): this file is named ``pdfquery.py``, so ``import pdfquery`` can
resolve to this script itself (instead of the installed library) when Python
is run from this directory -- rename the file to avoid the shadowing.
"""
import pandas  # NOTE(review): unused in this script; remove if nothing else needs it
import pdfquery

# Read and parse the PDF into pdfquery's queryable document model.
pdf = pdfquery.PDFQuery('cc.pdf')
pdf.load()

# Dump the parsed layout tree as human-readable XML for inspection.
pdf.tree.write('cc.xml', pretty_print=True)
# (removed trailing bare `pdf` expression -- it was a no-op outside a REPL)

21
robopato.sql

@ -1,44 +1,43 @@
CREATE TABLE `ACCOUNTS` (
`ID` integer PRIMARY KEY NOT NULL AUTO_INCREMENT,
`NAME` varchar(20),
`CREATED` datetime,
`CREATED` datetime NOT NULL,
`UPDATED` datetime
);
CREATE TABLE `TRANSACTION` (
`ID` integer PRIMARY KEY NOT NULL AUTO_INCREMENT,
`TDATE` date,
`ACCOUNTID` integer,
`MEMO` varchar(30),
`CITY` varchar(20),
`TDATE` date NOT NULL,
`ACCOUNTID` integer NOT NULL,
`MEMO` varchar(50) NOT NULL,
`COUNTRY` char(2),
`OUTFLOW` decimal(20,2),
`INFLOW` decimal(12,2),
`OWNERID` integer,
`INSTALLMENT_NR` integer,
`INSTALLMENT_TT` integer,
`CREATED` datetime,
`CREATED` datetime NOT NULL,
`UPDATED` datetime
);
CREATE TABLE `PAYEE` (
`ID` integer PRIMARY KEY NOT NULL AUTO_INCREMENT,
`NAME` varchar(20),
`CREATED` datetime,
`CREATED` datetime NOT NULL,
`UPDATED` datetime
);
CREATE TABLE `OWNER` (
`ID` integer PRIMARY KEY NOT NULL AUTO_INCREMENT,
`NAME` varchar(20),
`CREATED` datetime,
`CREATED` datetime NOT NULL,
`UPDATED` datetime
);
CREATE TABLE `CATEGORY` (
`ID` integer PRIMARY KEY NOT NULL AUTO_INCREMENT,
`NAME` varchar(20),
`CREATED` datetime,
`CREATED` datetime NOT NULL,
`UPDATED` datetime
);
@ -46,7 +45,7 @@ CREATE TABLE `SUBCATEGORY` (
`ID` integer PRIMARY KEY NOT NULL AUTO_INCREMENT,
`CATEGORYID` integer,
`NAME` varchar(20),
`CREATED` datetime,
`CREATED` datetime NOT NULL,
`UPDATED` datetime
);
@ -57,7 +56,7 @@ CREATE TABLE `CATEGORIZED_TRANSACTIONS` (
`PAYEEID` integer,
`CATEGORYID` integer,
`SUBCATEGORYID` integer,
`CREATED` datetime,
`CREATED` datetime NOT NULL,
`UPDATED` datetime
);

Loading…
Cancel
Save