Compare commits

..

9 commits
v0.2.0 ... main

6 changed files with 97 additions and 8 deletions

2
.gitignore vendored
View file

@ -10,6 +10,7 @@ log
tmp
venv
venv.coverage
*.csv
*.db
*.doctrees
*.env
@ -20,6 +21,7 @@ venv.coverage
*.pyd
*.pyo
*.swp
*.txt
*.egg-info
_build
_version.py

View file

@ -1,3 +1,4 @@
v0.2.1 Script to extract useful fields from CSV.
v0.2.0 Functions for finding useful fields in CSV.
v0.1.0 Setup scripts.
v0.0.1 Hubspot Parse.

View file

@ -13,11 +13,21 @@ pip install poetry
poetry install
```
# Run
Thusly.
Example:
# Usage
```
hsparse-csv-contacts --csv ~/all-contacts.csv
$ hsparse-csv-contacts -h
usage: hsparse-csv-contacts [-h] [-d] [-e] [-f] [-n] csv_file
Parse Hubspot Contacts CSV Export
positional arguments:
csv_file Contacts CSV File
options:
-h, --help show this help message and exit
-d, --dump Dump CSV contents
-e, --empty List empty columns
-f, --fields Fields from CSV header
-n, --non_empty List number of non-empty values for each column
```

View file

@ -0,0 +1,71 @@
# MIT License
# Copyright (c) 2024 Jeff Moe
"""Read a CSV, extract selected columns, and write them to a new CSV."""
import csv
import argparse
def parse_args():
    """Parse the command-line arguments for the column extractor.

    Returns:
        The ``argparse.Namespace`` produced by the parser, carrying the
        ``headers_file``, ``input_csv`` and ``output_csv`` positionals.
    """
    cli = argparse.ArgumentParser(description="Extract CSV Columns, Output CSV")
    # All three arguments are required positionals; the tuple order is the
    # order in which they must appear on the command line.
    positionals = (
        ("headers_file", "Headers File"),
        ("input_csv", "Input CSV File"),
        ("output_csv", "Output CSV File"),
    )
    for dest, help_text in positionals:
        cli.add_argument(dest, help=help_text, type=str)
    return cli.parse_args()
def read_good_headers(filename):
    """Read the list of 'good' headers from a file, one header per line.

    Args:
        filename: Path to a text file listing one column name per line.

    Returns:
        A list of header names in file order, with surrounding whitespace
        stripped and blank lines dropped.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    # Decode as UTF-8 so the names compare equal to the CSV header, which
    # filter_csv also reads as UTF-8 regardless of the platform default.
    with open(filename, "r", encoding="utf-8") as file:
        stripped = (line.strip() for line in file)
        # A blank (or trailing empty) line would otherwise yield a "" header
        # and accidentally select any unnamed column in the CSV.
        return [header for header in stripped if header]
def filter_csv(input_csv, output_csv, good_headers):
    """Copy only the 'good' columns of an input CSV into an output CSV.

    Columns are kept in the order they appear in the input file; names in
    ``good_headers`` that the input does not contain are ignored.

    Args:
        input_csv: Path of the CSV file to read (UTF-8, first row is the header).
        output_csv: Path of the CSV file to create or overwrite (UTF-8).
        good_headers: Iterable of column names to keep.

    Raises:
        OSError: If either file cannot be opened.
    """
    # Build the lookup once so each header check is a constant-time test.
    wanted = set(good_headers)

    with open(input_csv, mode="r", newline="", encoding="utf-8") as infile:
        reader = csv.DictReader(infile)

        # DictReader.fieldnames is None when the input has no header row
        # (e.g. an empty file); treat that as "no columns" instead of crashing.
        filtered_fieldnames = [
            field for field in (reader.fieldnames or []) if field in wanted
        ]

        with open(output_csv, mode="w", newline="", encoding="utf-8") as outfile:
            # extrasaction="ignore" makes the writer drop every key that is
            # not in filtered_fieldnames, so rows can be streamed unmodified.
            writer = csv.DictWriter(
                outfile,
                fieldnames=filtered_fieldnames,
                extrasaction="ignore",
            )
            # Write the header line (column names) first.
            writer.writeheader()
            writer.writerows(reader)
def main():
    """Command-line entry point: filter a CSV down to the listed columns.

    Reads the three positional arguments, loads the header allow-list,
    writes the filtered CSV, and prints a confirmation line to stdout.
    """
    cli_args = parse_args()
    destination = cli_args.output_csv

    # The allow-list must be loaded before the CSV can be filtered against it.
    wanted_headers = read_good_headers(cli_args.headers_file)
    filter_csv(cli_args.input_csv, destination, wanted_headers)

    print(f"Filtered CSV has been written to {destination}")
# Run the CLI only when executed as a script, so importing this module
# (e.g. through the packaged console-script entry point) has no side effects.
if __name__ == "__main__":
    main()

View file

@ -73,7 +73,11 @@ def csv_non_empty(CSV):
}
unique_counts = {col: df[col].nunique() for col in non_empty_columns.keys()}
sorted_columns = sorted(unique_counts.items(), key=lambda x: x[1], reverse=True)
sorted_columns = sorted(
unique_counts.items(),
key=lambda x: (unique_counts[x[0]], non_empty_columns[x[0]]),
reverse=True,
)
print("Column\tNon-empty values\tUnique values")

View file

@ -22,7 +22,7 @@ packages = [
{ include = "hsparse" },
]
readme = "README.md"
version = "0.2.0"
version = "0.2.1"
[tool.poetry.dependencies]
python = "^3.10"
@ -35,6 +35,7 @@ build-backend = "poetry.core.masonry.api"
[tool.poetry.scripts]
hsparse-csv-contacts = "hsparse.parse_csv_contacts:main"
hsparse-extract-columns = "hsparse.extract_columns_to_csv:main"
[tool.poetry.urls]
homepage = "https://libre.is/libre/hsparse"