Compare commits


29 commits: v0.0.1...main

Author SHA1 Message Date
Jeff Moe bd4f659c04 v0.2.1 2024-09-21 12:44:17 -06:00
Jeff Moe 9b4ce47caf rm old comments 2024-09-21 12:43:41 -06:00
Jeff Moe 78752392eb Command line options for extract 2024-09-21 12:42:51 -06:00
Jeff Moe 14c7c604fe hsparse-extract-columns bin script 2024-09-21 12:34:46 -06:00
Jeff Moe 8a13aae58f rename to extract not select 2024-09-21 12:31:08 -06:00
Jeff Moe 01117d4ee5 Draft script to select columns and write CSV 2024-09-21 12:30:23 -06:00
Jeff Moe 9a2662a22c ignore .csv and .txt files 2024-09-21 12:21:54 -06:00
Jeff Moe a1a0048135 Usage doc 2024-09-19 19:48:22 -06:00
Jeff Moe 3c9c6c97b9 Sort by unique counts 2024-09-19 19:46:30 -06:00
Jeff Moe 7dfdfb07df v0.2.0 2024-09-19 19:36:11 -06:00
Jeff Moe 98671f7916 Read csv file without -c 2024-09-19 19:35:14 -06:00
Jeff Moe e08dfc02f7 Count unique values per column 2024-09-19 19:26:23 -06:00
Jeff Moe 5b02a6198f List how many times a field is used in a column 2024-09-19 19:15:40 -06:00
Jeff Moe f2e145092a List non-empty columns with number of hits 2024-09-19 19:12:02 -06:00
Jeff Moe a92f57cbad rm large header 2024-09-19 19:06:34 -06:00
Jeff Moe cf58685e23 Print list of empty columns 2024-09-19 19:05:54 -06:00
Jeff Moe e551aa88a8 Dump all rows 2024-09-19 19:00:27 -06:00
Jeff Moe 61d67961ee Output CSV fields nicer 2024-09-19 18:44:51 -06:00
Jeff Moe d1c9de1066 pandas dep 2024-09-19 18:03:28 -06:00
Jeff Moe 4f5841cbc3 Print CSV header 2024-09-19 17:51:39 -06:00
Jeff Moe b891fa4823 theia, parrot python launcher 2024-09-19 17:51:26 -06:00
Jeff Moe 743226bbce Rename to csv_dump 2024-09-19 12:40:53 -06:00
Jeff Moe 354bf1a1ff Dump option 2024-09-19 12:39:37 -06:00
Jeff Moe c1807e0d70 CLI argument parsing 2024-09-19 12:20:47 -06:00
Jeff Moe 7815babf19 HOWTO run hsparse-csv-contact 2024-09-19 12:20:30 -06:00
Jeff Moe 6bba38652b v0.1.0 2024-09-19 12:14:51 -06:00
Jeff Moe a9ecc3132c Script setup 2024-09-19 12:14:27 -06:00
Jeff Moe 1762126e23 Poetry lock 2024-09-19 12:08:02 -06:00
Jeff Moe b07a500610 Install notes 2024-09-19 12:07:37 -06:00
8 changed files with 1890 additions and 12 deletions

.gitignore (vendored): 2 lines added

@@ -10,6 +10,7 @@ log
 tmp
 venv
 venv.coverage
+*.csv
 *.db
 *.doctrees
 *.env
@@ -20,6 +21,7 @@ venv.coverage
 *.pyd
 *.pyo
 *.swp
+*.txt
 *.egg-info
 _build
 _version.py

.theia/launch.json (new file): 17 lines added

@@ -0,0 +1,17 @@
{
    // Use IntelliSense to learn about possible attributes.
    // Hover to view descriptions of existing attributes.
    "version": "0.2.0",
    "configurations": [
        {
            "name": "Python: Current File",
            "type": "python",
            "request": "launch",
            "program": "${file}",
            "console": "integratedTerminal",
            "justMyCode": true
        }
    ]
}


@@ -1 +1,4 @@
+v0.2.1 Script to extract useful fields from CSV.
+v0.2.0 Functions for finding useful fields in CSV.
+v0.1.0 Setup scripts.
 v0.0.1 Hubspot Parse.

README.md

@@ -1,2 +1,33 @@
# Hubspot Parse
Scripts for parsing Hubspot data, with a goal of supporting migrations.
# Install
Clone the repository and install it into a virtual environment:
```
git clone https://code.libre.is/libre/hsparse
cd hsparse/
python -m venv venv
source venv/bin/activate
pip install poetry
poetry install
```
# Usage
```
$ hsparse-csv-contacts -h
usage: hsparse-csv-contacts [-h] [-d] [-e] [-f] [-n] csv_file
Parse Hubspot Contacts CSV Export
positional arguments:
csv_file Contacts CSV File
options:
-h, --help show this help message and exit
-d, --dump Dump CSV contents
-e, --empty List empty columns
-f, --fields Fields from CSV header
-n, --non_empty List number of non-empty values for each column
```
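The same checks can also be run from Python rather than the shell, since the console script is a thin wrapper around module-level functions. A minimal sketch, assuming hsparse is installed in the active virtualenv and using the export filename from the original v0.0.1 script; the module and function names come from hsparse/parse_csv_contacts.py shown later in this diff:
```
# Programmatic equivalent of `hsparse-csv-contacts -e -n all-contacts.csv`.
# The CSV filename is an assumption; substitute your own export.
from hsparse.parse_csv_contacts import csv_empty, csv_non_empty

CSV = "all-contacts.csv"
csv_empty(CSV)      # print columns that contain no values at all
csv_non_empty(CSV)  # print non-empty and unique counts per column
```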

hsparse/extract_columns_to_csv.py (new file)

@@ -0,0 +1,71 @@
# MIT License
# Copyright (c) 2024 Jeff Moe
"""Read CSV, extract selected columns, and write them to a new CSV."""
import csv
import argparse


def parse_args():
    parser = argparse.ArgumentParser(description="Extract CSV Columns, Output CSV")
    parser.add_argument("headers_file", help="Headers File", type=str)
    parser.add_argument("input_csv", help="Input CSV File", type=str)
    parser.add_argument("output_csv", help="Output CSV File", type=str)
    args = parser.parse_args()
    return args


def read_good_headers(filename):
    """Reads and returns the list of 'good' headers from a given file."""
    with open(filename, "r") as file:
        return [line.strip() for line in file.readlines()]


def filter_csv(input_csv, output_csv, good_headers):
    """Filters an input CSV based on the provided good headers and writes to output CSV."""
    # Read the original CSV
    with open(input_csv, mode="r", newline="", encoding="utf-8") as infile:
        reader = csv.DictReader(infile)
        # Get only the required fieldnames ('good' headers)
        filtered_fieldnames = [
            field for field in reader.fieldnames if field in good_headers
        ]
        # Write to output CSV
        with open(output_csv, mode="w", newline="", encoding="utf-8") as outfile:
            writer = csv.DictWriter(outfile, fieldnames=filtered_fieldnames)
            # Write the header line (column names) first
            writer.writeheader()
            for row in reader:
                filtered_row = {
                    key: value
                    for key, value in row.items()
                    if key in filtered_fieldnames
                }
                writer.writerow(filtered_row)


def main():
    args = parse_args()
    headers_file = args.headers_file
    input_csv = args.input_csv
    output_csv = args.output_csv
    # Step 1: Read the list of good headers
    good_headers = read_good_headers(headers_file)
    # Step 2: Filter the CSV based on these headers and write to a new file
    filter_csv(input_csv, output_csv, good_headers)
    print(f"Filtered CSV has been written to {output_csv}")


if __name__ == "__main__":
    main()
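Since pandas is already a project dependency, the same column extraction can be sketched with pandas instead of csv.DictReader/DictWriter; the file names here are placeholders chosen for illustration, not names the repository defines:
```
# Alternative sketch of the extraction step using pandas (assumed file names).
import pandas as pd

# One 'good' header per line, as read_good_headers() expects.
with open("good_headers.txt") as fh:
    good_headers = [line.strip() for line in fh if line.strip()]

# A callable usecols keeps only the listed columns and tolerates headers
# that are missing from the export.
df = pd.read_csv("all-contacts.csv", usecols=lambda col: col in good_headers)
df.to_csv("contacts-extract.csv", index=False)
```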

hsparse/parse_csv_contacts.py

@@ -1,14 +1,110 @@
-#!/usr/bin/env python3
-''' Read CSV contacts file exported from hubspot.'''
+# MIT License
+# Copyright (c) 2024 Jeff Moe
+""" Read CSV contacts file exported from hubspot."""
+import argparse
 import csv
+import pandas as pd
-CSV="all-contacts.csv"
-print("Parsing" + CSV)
+def parse_args():
+    parser = argparse.ArgumentParser(description="Parse Hubspot Contacts CSV Export")
-with open(CSV, newline='') as csvfile:
-    contactreader = csv.reader(csvfile, delimiter=',', quotechar='"')
-    for row in contactreader:
-        print(', '.join(row))
+    parser.add_argument("csv_file", help="Contacts CSV File", type=str)
+    parser.add_argument(
+        "-d",
+        "--dump",
+        help="Dump CSV contents",
+        action="store_true",
+    )
+    parser.add_argument(
+        "-e",
+        "--empty",
+        help="List empty columns",
+        action="store_true",
+    )
+    parser.add_argument(
+        "-f",
+        "--fields",
+        help="Fields from CSV header",
+        action="store_true",
+    )
+    parser.add_argument(
+        "-n",
+        "--non_empty",
+        help="List number of non-empty values for each column",
+        action="store_true",
+    )
+    args = parser.parse_args()
+    return args
+def csv_dump(CSV):
+    df = pd.read_csv(CSV, low_memory=False, chunksize=1000)
+    for chunk in df:
+        print(chunk.to_string())
+def csv_empty(CSV):
+    df = pd.read_csv(CSV, low_memory=False, header=0)
+    empty_columns = [col for col in df.columns if df[col].isnull().all()]
+    if empty_columns:
+        print("Empty columns:")
+        print("\n".join(empty_columns))
+    else:
+        print("No empty columns found.")
+def csv_fields(CSV):
+    df = pd.read_csv(CSV, low_memory=False, header=0)
+    print("\n".join([col for col in df.columns]))
+def csv_non_empty(CSV):
+    df = pd.read_csv(CSV, low_memory=False, header=0)
+    non_empty_columns = {
+        col: df[col].count() for col in df.columns if not df[col].isnull().all()
+    }
+    unique_counts = {col: df[col].nunique() for col in non_empty_columns.keys()}
+    sorted_columns = sorted(
+        unique_counts.items(),
+        key=lambda x: (unique_counts[x[0]], non_empty_columns[x[0]]),
+        reverse=True,
+    )
+    print("Column\tNon-empty values\tUnique values")
+    if sorted_columns:
+        for col, unique_count in sorted_columns:
+            count = non_empty_columns[col]
+            print(f"{col}\t{count}\t{unique_count}")
+    else:
+        print("No non-empty values found.")
+def main():
+    args = parse_args()
+    CSV = args.csv_file
+    if args.dump:
+        csv_dump(CSV)
+    if args.empty:
+        csv_empty(CSV)
+    if args.fields:
+        csv_fields(CSV)
+    if args.non_empty:
+        csv_non_empty(CSV)
+if __name__ == "__main__":
+    main()
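csv_non_empty() sorts columns by unique-value count, then by non-empty count, both descending. A tiny illustration of the two pandas calls it relies on, using made-up data rather than a real export:
```
# Toy data showing count() (non-empty values) vs nunique() (distinct values),
# the two measures csv_non_empty() reports. Values are invented.
import pandas as pd

df = pd.DataFrame({
    "Email": ["a@example.com", "b@example.com", None],
    "Country": ["US", "US", "US"],
    "Fax": [None, None, None],  # all empty, as csv_empty() would flag
})
for col in df.columns:
    print(col, df[col].count(), df[col].nunique())
# Email 2 2, Country 3 1, Fax 0 0
```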

poetry.lock (generated): 1658 lines; diff suppressed because it is too large.

pyproject.toml

@@ -22,20 +22,20 @@ packages = [
     { include = "hsparse" },
 ]
 readme = "README.md"
-version = "0.0.1"
+version = "0.2.1"
 [tool.poetry.dependencies]
 python = "^3.10"
 setuptools_scm = "*"
+pandas = "^2.2.2"
 [build-system]
 requires = ["poetry-core", "setuptools_scm"]
 build-backend = "poetry.core.masonry.api"
 [tool.poetry.scripts]
-hsparse = "hsparse.main:parse_csv_contacts"
+hsparse-csv-contacts = "hsparse.parse_csv_contacts:main"
+hsparse-extract-columns = "hsparse.extract_columns_to_csv:main"
 [tool.poetry.urls]
 homepage = "https://libre.is/libre/hsparse"
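Each [tool.poetry.scripts] entry of the form name = "module:function" installs a console script that imports the module and calls the named function; roughly the behaviour of the sketch below, not the exact wrapper Poetry generates:
```
# Approximate behaviour of the hsparse-csv-contacts entry point.
import sys

from hsparse.parse_csv_contacts import main

if __name__ == "__main__":
    sys.exit(main())
```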