Compare commits


29 commits: v0.0.1...main

Author SHA1 Message Date
Jeff Moe bd4f659c04 v0.2.1 2024-09-21 12:44:17 -06:00
Jeff Moe 9b4ce47caf rm old comments 2024-09-21 12:43:41 -06:00
Jeff Moe 78752392eb Command line options for extract 2024-09-21 12:42:51 -06:00
Jeff Moe 14c7c604fe hsparse-extract-columns bin script 2024-09-21 12:34:46 -06:00
Jeff Moe 8a13aae58f rename to extract not select 2024-09-21 12:31:08 -06:00
Jeff Moe 01117d4ee5 Draft script to select columns and write CSV 2024-09-21 12:30:23 -06:00
Jeff Moe 9a2662a22c ignore .csv and .txt files 2024-09-21 12:21:54 -06:00
Jeff Moe a1a0048135 Usage doc 2024-09-19 19:48:22 -06:00
Jeff Moe 3c9c6c97b9 Sort by unique counts 2024-09-19 19:46:30 -06:00
Jeff Moe 7dfdfb07df v0.2.0 2024-09-19 19:36:11 -06:00
Jeff Moe 98671f7916 Read csv file without -c 2024-09-19 19:35:14 -06:00
Jeff Moe e08dfc02f7 Count unique values per column 2024-09-19 19:26:23 -06:00
Jeff Moe 5b02a6198f List how many times a field is used in a column 2024-09-19 19:15:40 -06:00
Jeff Moe f2e145092a List non-empty columns with number of hits 2024-09-19 19:12:02 -06:00
Jeff Moe a92f57cbad rm large header 2024-09-19 19:06:34 -06:00
Jeff Moe cf58685e23 Print list of empty columns 2024-09-19 19:05:54 -06:00
Jeff Moe e551aa88a8 Dump all rows 2024-09-19 19:00:27 -06:00
Jeff Moe 61d67961ee Output CSV fields nicer 2024-09-19 18:44:51 -06:00
Jeff Moe d1c9de1066 pandas dep 2024-09-19 18:03:28 -06:00
Jeff Moe 4f5841cbc3 Print CSV header 2024-09-19 17:51:39 -06:00
Jeff Moe b891fa4823 theia, parrot python launcher 2024-09-19 17:51:26 -06:00
Jeff Moe 743226bbce Rename to csv_dump 2024-09-19 12:40:53 -06:00
Jeff Moe 354bf1a1ff Dump option 2024-09-19 12:39:37 -06:00
Jeff Moe c1807e0d70 CLI argument parsing 2024-09-19 12:20:47 -06:00
Jeff Moe 7815babf19 HOWTO run hsparse-csv-contact 2024-09-19 12:20:30 -06:00
Jeff Moe 6bba38652b v0.1.0 2024-09-19 12:14:51 -06:00
Jeff Moe a9ecc3132c Script setup 2024-09-19 12:14:27 -06:00
Jeff Moe 1762126e23 Poetry lock 2024-09-19 12:08:02 -06:00
Jeff Moe b07a500610 Install notes 2024-09-19 12:07:37 -06:00
8 changed files with 1890 additions and 12 deletions

.gitignore (vendored): 2 lines added

@@ -10,6 +10,7 @@ log
 tmp
 venv
 venv.coverage
+*.csv
 *.db
 *.doctrees
 *.env
@@ -20,6 +21,7 @@ venv.coverage
 *.pyd
 *.pyo
 *.swp
+*.txt
 *.egg-info
 _build
 _version.py

.theia/launch.json (new file): 17 lines added

@@ -0,0 +1,17 @@
{
    // Use IntelliSense to learn about possible attributes.
    // Hover to view descriptions of existing attributes.
    "version": "0.2.0",
    "configurations": [
        {
            "name": "Python: Current File",
            "type": "python",
            "request": "launch",
            "program": "${file}",
            "console": "integratedTerminal",
            "justMyCode": true
        }
    ]
}


@@ -1 +1,4 @@
+v0.2.1 Script to extract useful fields from CSV.
+v0.2.0 Functions for finding useful fields in CSV.
+v0.1.0 Setup scripts.
 v0.0.1 Hubspot Parse.

README.md

@@ -1,2 +1,33 @@
# Hubspot Parse
Scripts for parsing Hubspot data, with a goal of supporting migrations.
# Install
Clone the repository and install it into a virtual environment:
```
git clone https://code.libre.is/libre/hsparse
cd hsparse/
python -m venv venv
source venv/bin/activate
pip install poetry
poetry install
```
# Usage
```
$ hsparse-csv-contacts -h
usage: hsparse-csv-contacts [-h] [-d] [-e] [-f] [-n] csv_file
Parse Hubspot Contacts CSV Export
positional arguments:
csv_file Contacts CSV File
options:
-h, --help show this help message and exit
-d, --dump Dump CSV contents
-e, --empty List empty columns
-f, --fields Fields from CSV header
-n, --non_empty List number of non-empty values for each column
```
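The same checks can also be run from Python rather than the shell, since the console script is a thin wrapper around module-level functions. A minimal sketch, assuming hsparse is installed in the active virtualenv and using the export filename from the original v0.0.1 script; the module and function names come from hsparse/parse_csv_contacts.py shown later in this diff:
```
# Programmatic equivalent of `hsparse-csv-contacts -e -n all-contacts.csv`.
# The CSV filename is an assumption; substitute your own export.
from hsparse.parse_csv_contacts import csv_empty, csv_non_empty

CSV = "all-contacts.csv"
csv_empty(CSV)      # print columns that contain no values at all
csv_non_empty(CSV)  # print non-empty and unique counts per column
```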

hsparse/extract_columns_to_csv.py (new file)

@@ -0,0 +1,71 @@
# MIT License
# Copyright (c) 2024 Jeff Moe
"""Read CSV, extract selected columns, and write them to a new CSV."""
import csv
import argparse


def parse_args():
    parser = argparse.ArgumentParser(description="Extract CSV Columns, Output CSV")
    parser.add_argument("headers_file", help="Headers File", type=str)
    parser.add_argument("input_csv", help="Input CSV File", type=str)
    parser.add_argument("output_csv", help="Output CSV File", type=str)
    args = parser.parse_args()
    return args


def read_good_headers(filename):
    """Reads and returns the list of 'good' headers from a given file."""
    with open(filename, "r") as file:
        return [line.strip() for line in file.readlines()]


def filter_csv(input_csv, output_csv, good_headers):
    """Filters an input CSV based on the provided good headers and writes to output CSV."""
    # Read the original CSV
    with open(input_csv, mode="r", newline="", encoding="utf-8") as infile:
        reader = csv.DictReader(infile)
        # Get only the required fieldnames ('good' headers)
        filtered_fieldnames = [
            field for field in reader.fieldnames if field in good_headers
        ]
        # Write to output CSV
        with open(output_csv, mode="w", newline="", encoding="utf-8") as outfile:
            writer = csv.DictWriter(outfile, fieldnames=filtered_fieldnames)
            # Write the header line (column names) first
            writer.writeheader()
            for row in reader:
                filtered_row = {
                    key: value
                    for key, value in row.items()
                    if key in filtered_fieldnames
                }
                writer.writerow(filtered_row)


def main():
    args = parse_args()
    headers_file = args.headers_file
    input_csv = args.input_csv
    output_csv = args.output_csv
    # Step 1: Read the list of good headers
    good_headers = read_good_headers(headers_file)
    # Step 2: Filter the CSV based on these headers and write to a new file
    filter_csv(input_csv, output_csv, good_headers)
    print(f"Filtered CSV has been written to {output_csv}")


if __name__ == "__main__":
    main()
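Since pandas is already a project dependency, the same column extraction can be sketched with pandas instead of csv.DictReader/DictWriter; the file names here are placeholders chosen for illustration, not names the repository defines:
```
# Alternative sketch of the extraction step using pandas (assumed file names).
import pandas as pd

# One 'good' header per line, as read_good_headers() expects.
with open("good_headers.txt") as fh:
    good_headers = [line.strip() for line in fh if line.strip()]

# A callable usecols keeps only the listed columns and tolerates headers
# that are missing from the export.
df = pd.read_csv("all-contacts.csv", usecols=lambda col: col in good_headers)
df.to_csv("contacts-extract.csv", index=False)
```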

hsparse/parse_csv_contacts.py

@@ -1,14 +1,110 @@
-#!/usr/bin/env python3
-''' Read CSV contacts file exported from hubspot.'''
+# MIT License
+# Copyright (c) 2024 Jeff Moe
+""" Read CSV contacts file exported from hubspot."""
+import argparse
 import csv
+import pandas as pd
-CSV="all-contacts.csv"
-print("Parsing" + CSV)
+def parse_args():
+    parser = argparse.ArgumentParser(description="Parse Hubspot Contacts CSV Export")
-with open(CSV, newline='') as csvfile:
-    contactreader = csv.reader(csvfile, delimiter=',', quotechar='"')
-    for row in contactreader:
-        print(', '.join(row))
+    parser.add_argument("csv_file", help="Contacts CSV File", type=str)
+    parser.add_argument(
+        "-d",
+        "--dump",
+        help="Dump CSV contents",
+        action="store_true",
+    )
+    parser.add_argument(
+        "-e",
+        "--empty",
+        help="List empty columns",
+        action="store_true",
+    )
+    parser.add_argument(
+        "-f",
+        "--fields",
+        help="Fields from CSV header",
+        action="store_true",
+    )
+    parser.add_argument(
+        "-n",
+        "--non_empty",
+        help="List number of non-empty values for each column",
+        action="store_true",
+    )
+    args = parser.parse_args()
+    return args
+def csv_dump(CSV):
+    df = pd.read_csv(CSV, low_memory=False, chunksize=1000)
+    for chunk in df:
+        print(chunk.to_string())
+def csv_empty(CSV):
+    df = pd.read_csv(CSV, low_memory=False, header=0)
+    empty_columns = [col for col in df.columns if df[col].isnull().all()]
+    if empty_columns:
+        print("Empty columns:")
+        print("\n".join(empty_columns))
+    else:
+        print("No empty columns found.")
+def csv_fields(CSV):
+    df = pd.read_csv(CSV, low_memory=False, header=0)
+    print("\n".join([col for col in df.columns]))
+def csv_non_empty(CSV):
+    df = pd.read_csv(CSV, low_memory=False, header=0)
+    non_empty_columns = {
+        col: df[col].count() for col in df.columns if not df[col].isnull().all()
+    }
+    unique_counts = {col: df[col].nunique() for col in non_empty_columns.keys()}
+    sorted_columns = sorted(
+        unique_counts.items(),
+        key=lambda x: (unique_counts[x[0]], non_empty_columns[x[0]]),
+        reverse=True,
+    )
+    print("Column\tNon-empty values\tUnique values")
+    if sorted_columns:
+        for col, unique_count in sorted_columns:
+            count = non_empty_columns[col]
+            print(f"{col}\t{count}\t{unique_count}")
+    else:
+        print("No non-empty values found.")
+def main():
+    args = parse_args()
+    CSV = args.csv_file
+    if args.dump:
+        csv_dump(CSV)
+    if args.empty:
+        csv_empty(CSV)
+    if args.fields:
+        csv_fields(CSV)
+    if args.non_empty:
+        csv_non_empty(CSV)
+if __name__ == "__main__":
+    main()
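csv_non_empty() sorts columns by unique-value count, then by non-empty count, both descending. A tiny illustration of the two pandas calls it relies on, using made-up data rather than a real export:
```
# Toy data showing count() (non-empty values) vs nunique() (distinct values),
# the two measures csv_non_empty() reports. Values are invented.
import pandas as pd

df = pd.DataFrame({
    "Email": ["a@example.com", "b@example.com", None],
    "Country": ["US", "US", "US"],
    "Fax": [None, None, None],  # all empty, as csv_empty() would flag
})
for col in df.columns:
    print(col, df[col].count(), df[col].nunique())
# Email 2 2, Country 3 1, Fax 0 0
```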

poetry.lock (generated): 1658 lines; diff suppressed because it is too large.

pyproject.toml

@@ -22,20 +22,20 @@ packages = [
     { include = "hsparse" },
 ]
 readme = "README.md"
-version = "0.0.1"
+version = "0.2.1"
 [tool.poetry.dependencies]
 python = "^3.10"
 setuptools_scm = "*"
+pandas = "^2.2.2"
 [build-system]
 requires = ["poetry-core", "setuptools_scm"]
 build-backend = "poetry.core.masonry.api"
 [tool.poetry.scripts]
-hsparse = "hsparse.main:parse_csv_contacts"
+hsparse-csv-contacts = "hsparse.parse_csv_contacts:main"
+hsparse-extract-columns = "hsparse.extract_columns_to_csv:main"
 [tool.poetry.urls]
 homepage = "https://libre.is/libre/hsparse"
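Each [tool.poetry.scripts] entry of the form name = "module:function" installs a console script that imports the module and calls the named function; roughly the behaviour of the sketch below, not the exact wrapper Poetry generates:
```
# Approximate behaviour of the hsparse-csv-contacts entry point.
import sys

from hsparse.parse_csv_contacts import main

if __name__ == "__main__":
    sys.exit(main())
```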