Compare commits

..

No commits in common. "main" and "v0.0.1" have entirely different histories.
main ... v0.0.1

8 changed files with 12 additions and 1890 deletions

2
.gitignore vendored
View file

@ -10,7 +10,6 @@ log
tmp tmp
venv venv
venv.coverage venv.coverage
*.csv
*.db *.db
*.doctrees *.doctrees
*.env *.env
@ -21,7 +20,6 @@ venv.coverage
*.pyd *.pyd
*.pyo *.pyo
*.swp *.swp
*.txt
*.egg-info *.egg-info
_build _build
_version.py _version.py

View file

@ -1,17 +0,0 @@
{
// Use IntelliSense to learn about possible attributes.
// Hover to view descriptions of existing attributes.
"version": "0.2.0",
"configurations": [
{
"name": "Python: Current File",
"type": "python",
"request": "launch",
"program": "${file}",
"console": "integratedTerminal",
"justMyCode": true
}
]
}

View file

@ -1,4 +1 @@
v0.2.1 Script to extract useful fields from CSV.
v0.2.0 Functions for finding useful fields in CSV.
v0.1.0 Setup scripts.
v0.0.1 Hubspot Parse. v0.0.1 Hubspot Parse.

View file

@ -1,33 +1,2 @@
# Hubspot Parse # Hubspot Parse
Scripts for parsing Hubspot data with a goal towards migrations. Scripts for parsing Hubspot data with a goal towards migrations.
# Install
Thusly.
```
git clone https://code.libre.is/libre/hsparse
cd hsparse/
python -m venv venv
source venv/bin/activate
pip install poetry
poetry install
```
# Usage
```
$ hsparse-csv-contacts -h
usage: hsparse-csv-contacts [-h] [-d] [-e] [-f] [-n] csv_file
Parse Hubspot Contacts CSV Export
positional arguments:
csv_file Contacts CSV File
options:
-h, --help show this help message and exit
-d, --dump Dump CSV contents
-e, --empty List empty columns
-f, --fields Fields from CSV header
-n, --non_empty List number of non-empty values for each column
```

View file

@ -1,71 +0,0 @@
# MIT License
# Copyright (c) 2024 Jeff Moe
""" Read CSV and extract selected columns and write to new CVS"""
import csv
import argparse
def parse_args():
    """Build and evaluate the command-line interface for the extractor.

    Returns:
        argparse.Namespace with ``headers_file``, ``input_csv`` and
        ``output_csv`` attributes, all strings.
    """
    parser = argparse.ArgumentParser(description="Extract CSV Columns, Output CSV")
    # Three required positional arguments, declared in call order.
    for name, description in (
        ("headers_file", "Headers File"),
        ("input_csv", "Input CSV File"),
        ("output_csv", "Output CSV File"),
    ):
        parser.add_argument(name, help=description, type=str)
    return parser.parse_args()
def read_good_headers(filename):
    """Load the whitelist of column names, one header per line.

    Args:
        filename: path to a plain-text file listing one header per line.

    Returns:
        List of header names with surrounding whitespace stripped.
    """
    with open(filename, "r") as handle:
        raw_lines = handle.readlines()
    return [entry.strip() for entry in raw_lines]
def filter_csv(input_csv, output_csv, good_headers):
    """Copy ``input_csv`` to ``output_csv``, keeping only ``good_headers`` columns.

    Column order in the output follows the input file's header row, not the
    order in ``good_headers``.  A completely empty input file yields an
    output file containing only an (empty) header line instead of raising.

    Args:
        input_csv: path of the CSV to read.
        output_csv: path of the CSV to create or overwrite.
        good_headers: iterable of column names to keep.
    """
    with open(input_csv, mode="r", newline="", encoding="utf-8") as infile:
        reader = csv.DictReader(infile)
        # reader.fieldnames is None for an empty file; guard so we still
        # emit valid output instead of a TypeError while iterating None.
        filtered_fieldnames = [
            field for field in (reader.fieldnames or []) if field in good_headers
        ]
        with open(output_csv, mode="w", newline="", encoding="utf-8") as outfile:
            writer = csv.DictWriter(outfile, fieldnames=filtered_fieldnames)
            # Header line first, then each row projected onto the kept columns.
            writer.writeheader()
            for row in reader:
                filtered_row = {
                    key: value
                    for key, value in row.items()
                    if key in filtered_fieldnames
                }
                writer.writerow(filtered_row)
def main():
    """CLI entry point: read the header whitelist, then filter the CSV."""
    args = parse_args()
    # Load the column whitelist first, then project the CSV onto it.
    wanted = read_good_headers(args.headers_file)
    filter_csv(args.input_csv, args.output_csv, wanted)
    print(f"Filtered CSV has been written to {args.output_csv}")


if __name__ == "__main__":
    main()

View file

@ -1,110 +1,14 @@
# MIT License #!/usr/bin/env python3
# Copyright (c) 2024 Jeff Moe ''' Read CSV contacts file exported from hubspot.'''
""" Read CSV contacts file exported from hubspot."""
import argparse
import csv import csv
import pandas as pd
CSV="all-contacts.csv"
def parse_args(): print("Parsing" + CSV)
parser = argparse.ArgumentParser(description="Parse Hubspot Contacts CSV Export")
parser.add_argument("csv_file", help="Contacts CSV File", type=str) with open(CSV, newline='') as csvfile:
contactreader = csv.reader(csvfile, delimiter=',', quotechar='"')
for row in contactreader:
print(', '.join(row))
parser.add_argument(
"-d",
"--dump",
help="Dump CSV contents",
action="store_true",
)
parser.add_argument(
"-e",
"--empty",
help="List empty columns",
action="store_true",
)
parser.add_argument(
"-f",
"--fields",
help="Fields from CSV header",
action="store_true",
)
parser.add_argument(
"-n",
"--non_empty",
help="List number of non-empty values for each column",
action="store_true",
)
args = parser.parse_args()
return args
def csv_dump(CSV):
df = pd.read_csv(CSV, low_memory=False, chunksize=1000)
for chunk in df:
print(chunk.to_string())
def csv_empty(CSV):
    """Report the CSV columns whose every value is missing."""
    frame = pd.read_csv(CSV, low_memory=False, header=0)
    all_null = [name for name in frame.columns if frame[name].isnull().all()]
    if not all_null:
        print("No empty columns found.")
    else:
        print("Empty columns:")
        print("\n".join(all_null))
def csv_fields(CSV):
    """Print every column name from the CSV header, one per line."""
    frame = pd.read_csv(CSV, low_memory=False, header=0)
    print("\n".join(list(frame.columns)))
def csv_non_empty(CSV):
    """Print a per-column summary: non-empty count and unique-value count.

    Columns are listed largest-first by unique-value count, then by
    non-empty count; entirely-empty columns are excluded from the table.
    """
    df = pd.read_csv(CSV, low_memory=False, header=0)
    # Columns with at least one non-null value, mapped to their non-null counts.
    non_empty_columns = {
        col: df[col].count() for col in df.columns if not df[col].isnull().all()
    }
    # Distinct-value count for each of those same columns.
    unique_counts = {col: df[col].nunique() for col in non_empty_columns.keys()}
    # Descending sort: primary key = unique count, secondary = non-empty count.
    # NOTE(review): the key ignores x[1] and re-reads both dicts by column
    # name; x[1] already equals unique_counts[x[0]], so the result is the
    # same — presumably written this way for symmetry with the second key.
    sorted_columns = sorted(
        unique_counts.items(),
        key=lambda x: (unique_counts[x[0]], non_empty_columns[x[0]]),
        reverse=True,
    )
    print("Column\tNon-empty values\tUnique values")
    if sorted_columns:
        for col, unique_count in sorted_columns:
            count = non_empty_columns[col]
            print(f"{col}\t{count}\t{unique_count}")
    else:
        # Every column was entirely empty (or the file had no data columns).
        print("No non-empty values found.")
def main():
    """Dispatch each report requested on the command line."""
    args = parse_args()
    CSV = args.csv_file
    # Flags are independent; any combination runs, always in this order.
    for enabled, report in (
        (args.dump, csv_dump),
        (args.empty, csv_empty),
        (args.fields, csv_fields),
        (args.non_empty, csv_non_empty),
    ):
        if enabled:
            report(CSV)


if __name__ == "__main__":
    main()

1658
poetry.lock generated

File diff suppressed because it is too large Load diff

View file

@ -22,20 +22,20 @@ packages = [
{ include = "hsparse" }, { include = "hsparse" },
] ]
readme = "README.md" readme = "README.md"
version = "0.2.1" version = "0.0.1"
[tool.poetry.dependencies] [tool.poetry.dependencies]
python = "^3.10" python = "^3.10"
setuptools_scm = "*" setuptools_scm = "*"
pandas = "^2.2.2"
[build-system] [build-system]
requires = ["poetry-core", "setuptools_scm"] requires = ["poetry-core", "setuptools_scm"]
build-backend = "poetry.core.masonry.api" build-backend = "poetry.core.masonry.api"
[tool.poetry.scripts] [tool.poetry.scripts]
hsparse-csv-contacts = "hsparse.parse_csv_contacts:main" hsparse = "hsparse.main:parse_csv_contacts"
hsparse-extract-columns = "hsparse.extract_columns_to_csv:main"
[tool.poetry.urls] [tool.poetry.urls]
homepage = "https://libre.is/libre/hsparse" homepage = "https://libre.is/libre/hsparse"