Compare commits

..

No commits in common. "main" and "v0.0.1" have entirely different histories.
main ... v0.0.1

8 changed files with 12 additions and 1890 deletions

2
.gitignore vendored
View file

@ -10,7 +10,6 @@ log
tmp
venv
venv.coverage
*.csv
*.db
*.doctrees
*.env
@ -21,7 +20,6 @@ venv.coverage
*.pyd
*.pyo
*.swp
*.txt
*.egg-info
_build
_version.py

View file

@ -1,17 +0,0 @@
{
// Use IntelliSense to learn about possible attributes.
// Hover to view descriptions of existing attributes.
"version": "0.2.0",
"configurations": [
{
"name": "Python: Current File",
"type": "python",
"request": "launch",
"program": "${file}",
"console": "integratedTerminal",
"justMyCode": true
}
]
}

View file

@ -1,4 +1 @@
v0.2.1 Script to extract useful fields from CSV.
v0.2.0 Functions for finding useful fields in CSV.
v0.1.0 Setup scripts.
v0.0.1 Hubspot Parse.

View file

@ -1,33 +1,2 @@
# Hubspot Parse
Scripts for parsing Hubspot data with a goal towards migrations.
# Install
Clone the repository and install the dependencies with Poetry:
```
git clone https://code.libre.is/libre/hsparse
cd hsparse/
python -m venv venv
source venv/bin/activate
pip install poetry
poetry install
```
# Usage
```
$ hsparse-csv-contacts -h
usage: hsparse-csv-contacts [-h] [-d] [-e] [-f] [-n] csv_file
Parse Hubspot Contacts CSV Export
positional arguments:
csv_file Contacts CSV File
options:
-h, --help show this help message and exit
-d, --dump Dump CSV contents
-e, --empty List empty columns
-f, --fields Fields from CSV header
-n, --non_empty List number of non-empty values for each column
```

View file

@ -1,71 +0,0 @@
# MIT License
# Copyright (c) 2024 Jeff Moe
"""Read CSV and extract selected columns and write to a new CSV."""
import csv
import argparse
def parse_args():
    """Build and parse the command line for the column extractor.

    Returns the argparse namespace with three positional string
    arguments: headers_file, input_csv, output_csv.
    """
    cli = argparse.ArgumentParser(description="Extract CSV Columns, Output CSV")
    for arg_name, arg_help in (
        ("headers_file", "Headers File"),
        ("input_csv", "Input CSV File"),
        ("output_csv", "Output CSV File"),
    ):
        cli.add_argument(arg_name, help=arg_help, type=str)
    return cli.parse_args()
def read_good_headers(filename):
    """Return the list of 'good' header names, one per line of *filename*.

    Each line is stripped of surrounding whitespace (including the newline).
    """
    with open(filename, "r") as handle:
        return [entry.strip() for entry in handle]
def filter_csv(input_csv, output_csv, good_headers):
    """Copy *input_csv* to *output_csv*, keeping only the *good_headers* columns.

    The surviving columns keep the order they have in the input file, not the
    order of *good_headers*.

    Raises:
        ValueError: if the input file is empty (no header row) — the original
            code crashed with a TypeError in that case, because
            ``DictReader.fieldnames`` is ``None``.
    """
    # Set for O(1) membership tests instead of scanning the list per column.
    wanted = set(good_headers)
    with open(input_csv, mode="r", newline="", encoding="utf-8") as infile:
        reader = csv.DictReader(infile)
        if reader.fieldnames is None:
            # Empty file: fail with a clear message instead of a TypeError.
            raise ValueError(f"{input_csv}: no header row found")
        # Keep only the requested columns, preserving input order.
        filtered_fieldnames = [
            field for field in reader.fieldnames if field in wanted
        ]
        with open(output_csv, mode="w", newline="", encoding="utf-8") as outfile:
            writer = csv.DictWriter(outfile, fieldnames=filtered_fieldnames)
            # Header line first, then one filtered row per input row.
            writer.writeheader()
            for row in reader:
                writer.writerow({key: row[key] for key in filtered_fieldnames})
def main():
    """CLI entry point: read the header allow-list, then write the filtered CSV."""
    args = parse_args()
    # Step 1: the list of columns to keep.
    keep = read_good_headers(args.headers_file)
    # Step 2: copy only those columns into the new file.
    filter_csv(args.input_csv, args.output_csv, keep)
    print(f"Filtered CSV has been written to {args.output_csv}")


if __name__ == "__main__":
    main()

View file

@ -1,110 +1,14 @@
# MIT License
# Copyright (c) 2024 Jeff Moe
# NOTE(review): this shebang is not on line 1, so it has no effect here.
#!/usr/bin/env python3
''' Read CSV contacts file exported from hubspot.'''
# NOTE(review): the duplicated module docstring above/below and the
# hard-coded CSV constant look like leftovers from an older revision
# (CSV is only referenced by stray code inside parse_args) -- confirm.
""" Read CSV contacts file exported from hubspot."""
import argparse
import csv
import pandas as pd
CSV="all-contacts.csv"
def parse_args():
    """Parse the command line for the Hubspot-contacts CSV inspector.

    Returns the argparse namespace with the positional ``csv_file`` and the
    boolean flags ``dump``, ``empty``, ``fields``, ``non_empty``.

    Bug fixed: the previous body also printed "Parsing" and unconditionally
    opened the hard-coded "all-contacts.csv" while *parsing arguments* —
    leftover code from an earlier revision that crashed whenever that file
    was absent and ignored the user-supplied csv_file. Argument parsing now
    has no I/O side effects, matching the documented CLI.
    """
    parser = argparse.ArgumentParser(description="Parse Hubspot Contacts CSV Export")
    parser.add_argument("csv_file", help="Contacts CSV File", type=str)
    parser.add_argument(
        "-d",
        "--dump",
        help="Dump CSV contents",
        action="store_true",
    )
    parser.add_argument(
        "-e",
        "--empty",
        help="List empty columns",
        action="store_true",
    )
    parser.add_argument(
        "-f",
        "--fields",
        help="Fields from CSV header",
        action="store_true",
    )
    parser.add_argument(
        "-n",
        "--non_empty",
        help="List number of non-empty values for each column",
        action="store_true",
    )
    args = parser.parse_args()
    return args
def csv_dump(CSV):
    """Print the entire contents of *CSV*, streamed in 1000-row chunks."""
    for piece in pd.read_csv(CSV, low_memory=False, chunksize=1000):
        print(piece.to_string())
def csv_empty(CSV):
    """Report the columns of *CSV* that contain no values at all."""
    frame = pd.read_csv(CSV, low_memory=False, header=0)
    blanks = [name for name in frame.columns if frame[name].isnull().all()]
    if not blanks:
        print("No empty columns found.")
        return
    print("Empty columns:")
    print("\n".join(blanks))
def csv_fields(CSV):
    """Print each column name from the CSV header, one per line."""
    frame = pd.read_csv(CSV, low_memory=False, header=0)
    for name in frame.columns:
        print(name)
def csv_non_empty(CSV):
    """Print non-empty and unique value counts per column, busiest first.

    Columns are ranked descending by (unique values, non-empty values);
    all-empty columns are skipped entirely.
    """
    frame = pd.read_csv(CSV, low_memory=False, header=0)
    populated = {}
    distinct = {}
    for name in frame.columns:
        series = frame[name]
        if series.isnull().all():
            continue  # all-NaN columns are excluded from the report
        populated[name] = series.count()
        distinct[name] = series.nunique()
    ranking = sorted(
        distinct.items(),
        key=lambda item: (item[1], populated[item[0]]),
        reverse=True,
    )
    print("Column\tNon-empty values\tUnique values")
    if not ranking:
        print("No non-empty values found.")
        return
    for name, n_unique in ranking:
        print(f"{name}\t{populated[name]}\t{n_unique}")
def main():
    """Run each report the user requested against the CSV named on the CLI."""
    options = parse_args()
    path = options.csv_file
    # Flags are independent; any combination of reports may run.
    for requested, report in (
        (options.dump, csv_dump),
        (options.empty, csv_empty),
        (options.fields, csv_fields),
        (options.non_empty, csv_non_empty),
    ):
        if requested:
            report(path)


if __name__ == "__main__":
    main()

1658
poetry.lock generated

File diff suppressed because it is too large Load diff

View file

@ -22,20 +22,20 @@ packages = [
{ include = "hsparse" },
]
readme = "README.md"
version = "0.2.1"
version = "0.0.1"
[tool.poetry.dependencies]
python = "^3.10"
setuptools_scm = "*"
pandas = "^2.2.2"
[build-system]
requires = ["poetry-core", "setuptools_scm"]
build-backend = "poetry.core.masonry.api"
[tool.poetry.scripts]
hsparse-csv-contacts = "hsparse.parse_csv_contacts:main"
hsparse-extract-columns = "hsparse.extract_columns_to_csv:main"
hsparse = "hsparse.main:parse_csv_contacts"
[tool.poetry.urls]
homepage = "https://libre.is/libre/hsparse"