v0.2.1

rm old comments
Command line options for extract
2024-09-21 12:44:17 -06:00 · 2024-09-21 12:43:41 -06:00 · 2024-09-21 12:42:51 -06:00 · 2024-09-21 12:34:46 -06:00 · 2024-09-21 12:31:08 -06:00 · 2024-09-21 12:30:23 -06:00
6 changed files with 97 additions and 8 deletions
--- a/.gitignore
+++ b/.gitignore
@ -10,6 +10,7 @@ log
 tmp
 venv
 venv.coverage
 *.csv
 *.db
 *.doctrees
 *.env
@ -20,6 +21,7 @@ venv.coverage
 *.pyd
 *.pyo
 *.swp
 *.txt
 *.egg-info
 _build
 _version.py
--- a/CHANGELOG.txt
+++ b/CHANGELOG.txt
@ -1,3 +1,4 @@
 v0.2.1         Script to extract useful fields from CSV.
 v0.2.0         Functions for finding useful fields in CSV.
 v0.1.0         Setup scripts.
 v0.0.1         Hubspot Parse.
--- a/README.md
+++ b/README.md
@ -13,11 +13,21 @@ pip install poetry
 poetry install
 ```
-# Run
+# Usage
 Thusly.
 Example:
 ```
-hsparse-csv-contacts --csv ~/all-contacts.csv
+$ hsparse-csv-contacts -h
 usage: hsparse-csv-contacts [-h] [-d] [-e] [-f] [-n] csv_file
 Parse Hubspot Contacts CSV Export
 positional arguments:
  csv_file         Contacts CSV File
 options:
  -h, --help       show this help message and exit
  -d, --dump       Dump CSV contents
  -e, --empty      List empty columns
  -f, --fields     Fields from CSV header
  -n, --non_empty  List number of non-empty values for each column
 ```
--- a/hsparse/extract_columns_to_csv.py
+++ b/hsparse/extract_columns_to_csv.py
@ -0,0 +1,71 @@
 # MIT License
 # Copyright (c) 2024 Jeff Moe
 """ Read CSV and extract selected columns and write to new CVS"""
 import csv
 import argparse
 def parse_args():
    """Parse the command line into headers file, input CSV and output CSV paths."""
    cli = argparse.ArgumentParser(description="Extract CSV Columns, Output CSV")
    # Three required positionals, registered in the order they must be given.
    for dest, help_text in (
        ("headers_file", "Headers File"),
        ("input_csv", "Input CSV File"),
        ("output_csv", "Output CSV File"),
    ):
        cli.add_argument(dest, help=help_text, type=str)
    return cli.parse_args()
 def read_good_headers(filename):
    """Return the 'good' header names listed one per line in a text file.

    Leading and trailing whitespace is stripped from every line. Blank
    lines are skipped so that a trailing newline or spacer line does not
    produce an empty header name, which would otherwise match an unnamed
    CSV column.

    Args:
        filename: Path to the headers file, read as UTF-8.

    Returns:
        List of header names in the order they appear in the file.
    """
    # Decode as UTF-8 rather than the locale default so non-ASCII header
    # names compare equal to those read from the UTF-8 input CSV.
    with open(filename, "r", encoding="utf-8") as file:
        return [header for line in file if (header := line.strip())]
 def filter_csv(input_csv, output_csv, good_headers):
    """Copy only the columns named in good_headers from input_csv to output_csv.

    Columns keep the order they have in the input file. Names in
    good_headers that are not present in the input header row are ignored.
    Both files are handled as UTF-8.

    Args:
        input_csv: Path of the CSV file to read; its first row is the header.
        output_csv: Path of the CSV file to write (overwritten if it exists).
        good_headers: Iterable of column names to keep.
    """
    # Build the lookup once; set membership avoids rescanning a list per column.
    wanted = set(good_headers)

    with open(input_csv, mode="r", newline="", encoding="utf-8") as infile:
        reader = csv.DictReader(infile)

        # fieldnames is None when the input has no header row (empty file);
        # treat that as "no columns" instead of failing on iteration.
        filtered_fieldnames = [
            field for field in (reader.fieldnames or []) if field in wanted
        ]

        with open(output_csv, mode="w", newline="", encoding="utf-8") as outfile:
            # extrasaction="ignore" makes the writer drop every key that is
            # not a selected column, so rows need no manual filtering.
            writer = csv.DictWriter(
                outfile, fieldnames=filtered_fieldnames, extrasaction="ignore"
            )

            # Header line (column names) first, then the data rows.
            writer.writeheader()
            writer.writerows(reader)
 def main():
    """Command-line entry point: extract the listed columns into a new CSV."""
    opts = parse_args()
    output_csv = opts.output_csv

    # Load the wanted column names, then copy just those columns across.
    wanted_headers = read_good_headers(opts.headers_file)
    filter_csv(opts.input_csv, output_csv, wanted_headers)

    print(f"Filtered CSV has been written to {output_csv}")
 if __name__ == "__main__":
    main()
--- a/hsparse/parse_csv_contacts.py
+++ b/hsparse/parse_csv_contacts.py
@ -73,7 +73,11 @@ def csv_non_empty(CSV):
    }
    unique_counts = {col: df[col].nunique() for col in non_empty_columns.keys()}
-    sorted_columns = sorted(unique_counts.items(), key=lambda x: x[1], reverse=True)
+    sorted_columns = sorted(
        unique_counts.items(),
        key=lambda x: (unique_counts[x[0]], non_empty_columns[x[0]]),
        reverse=True,
    )
    print("Column\tNon-empty values\tUnique values")
--- a/pyproject.toml
+++ b/pyproject.toml
@ -22,7 +22,7 @@ packages = [
    { include = "hsparse" },
 ]
 readme = "README.md"
-version = "0.2.0"
+version = "0.2.1"
 [tool.poetry.dependencies]
 python = "^3.10"
@ -35,6 +35,7 @@ build-backend = "poetry.core.masonry.api"
 [tool.poetry.scripts]
 hsparse-csv-contacts = "hsparse.parse_csv_contacts:main"
 hsparse-extract-columns = "hsparse.extract_columns_to_csv:main"
 [tool.poetry.urls]
 homepage = "https://libre.is/libre/hsparse"
Author	SHA1	Message	Date
Jeff Moe	bd4f659c04	v0.2.1	2024-09-21 12:44:17 -06:00
Jeff Moe	9b4ce47caf	rm old comments	2024-09-21 12:43:41 -06:00
Jeff Moe	78752392eb	Command line options for extract	2024-09-21 12:42:51 -06:00
Jeff Moe	14c7c604fe	hsparse-extract-columns bin script	2024-09-21 12:34:46 -06:00
Jeff Moe	8a13aae58f	rename to extract not select	2024-09-21 12:31:08 -06:00
Jeff Moe	01117d4ee5	Draft script to select columns and write CSV	2024-09-21 12:30:23 -06:00
Jeff Moe	9a2662a22c	ignore .csv and .txt files	2024-09-21 12:21:54 -06:00
Jeff Moe	a1a0048135	Usage doc	2024-09-19 19:48:22 -06:00
Jeff Moe	3c9c6c97b9	Sort by unique counts	2024-09-19 19:46:30 -06:00