Compare commits
29 commits
Author | SHA1 | Date | |
---|---|---|---|
|
bd4f659c04 | ||
|
9b4ce47caf | ||
|
78752392eb | ||
|
14c7c604fe | ||
|
8a13aae58f | ||
|
01117d4ee5 | ||
|
9a2662a22c | ||
|
a1a0048135 | ||
|
3c9c6c97b9 | ||
|
7dfdfb07df | ||
|
98671f7916 | ||
|
e08dfc02f7 | ||
|
5b02a6198f | ||
|
f2e145092a | ||
|
a92f57cbad | ||
|
cf58685e23 | ||
|
e551aa88a8 | ||
|
61d67961ee | ||
|
d1c9de1066 | ||
|
4f5841cbc3 | ||
|
b891fa4823 | ||
|
743226bbce | ||
|
354bf1a1ff | ||
|
c1807e0d70 | ||
|
7815babf19 | ||
|
6bba38652b | ||
|
a9ecc3132c | ||
|
1762126e23 | ||
|
b07a500610 |
2
.gitignore
vendored
2
.gitignore
vendored
|
@ -10,6 +10,7 @@ log
|
||||||
tmp
|
tmp
|
||||||
venv
|
venv
|
||||||
venv.coverage
|
venv.coverage
|
||||||
|
*.csv
|
||||||
*.db
|
*.db
|
||||||
*.doctrees
|
*.doctrees
|
||||||
*.env
|
*.env
|
||||||
|
@ -20,6 +21,7 @@ venv.coverage
|
||||||
*.pyd
|
*.pyd
|
||||||
*.pyo
|
*.pyo
|
||||||
*.swp
|
*.swp
|
||||||
|
*.txt
|
||||||
*.egg-info
|
*.egg-info
|
||||||
_build
|
_build
|
||||||
_version.py
|
_version.py
|
||||||
|
|
17
.theia/launch.json
Normal file
17
.theia/launch.json
Normal file
|
@ -0,0 +1,17 @@
|
||||||
|
{
|
||||||
|
// Use IntelliSense to learn about possible attributes.
|
||||||
|
// Hover to view descriptions of existing attributes.
|
||||||
|
"version": "0.2.0",
|
||||||
|
"configurations": [
|
||||||
|
|
||||||
|
{
|
||||||
|
"name": "Python: Current File",
|
||||||
|
"type": "python",
|
||||||
|
"request": "launch",
|
||||||
|
"program": "${file}",
|
||||||
|
"console": "integratedTerminal",
|
||||||
|
"justMyCode": true
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
|
|
@ -1 +1,4 @@
|
||||||
|
v0.2.1 Script to extract useful fields from CSV.
|
||||||
|
v0.2.0 Functions for finding useful fields in CSV.
|
||||||
|
v0.1.0 Setup scripts.
|
||||||
v0.0.1 Hubspot Parse.
|
v0.0.1 Hubspot Parse.
|
||||||
|
|
31
README.md
31
README.md
|
@ -1,2 +1,33 @@
|
||||||
# Hubspot Parse
|
# Hubspot Parse
|
||||||
Scripts for parsing Hubspot data with a goal towards migrations.
|
Scripts for parsing Hubspot data with a goal towards migrations.
|
||||||
|
|
||||||
|
# Install
|
||||||
|
Thusly.
|
||||||
|
|
||||||
|
```
|
||||||
|
git clone https://code.libre.is/libre/hsparse
|
||||||
|
cd hsparse/
|
||||||
|
python -m venv venv
|
||||||
|
source venv/bin/activate
|
||||||
|
pip install poetry
|
||||||
|
poetry install
|
||||||
|
```
|
||||||
|
|
||||||
|
# Usage
|
||||||
|
```
|
||||||
|
$ hsparse-csv-contacts -h
|
||||||
|
usage: hsparse-csv-contacts [-h] [-d] [-e] [-f] [-n] csv_file
|
||||||
|
|
||||||
|
Parse Hubspot Contacts CSV Export
|
||||||
|
|
||||||
|
positional arguments:
|
||||||
|
csv_file Contacts CSV File
|
||||||
|
|
||||||
|
options:
|
||||||
|
-h, --help show this help message and exit
|
||||||
|
-d, --dump Dump CSV contents
|
||||||
|
-e, --empty List empty columns
|
||||||
|
-f, --fields Fields from CSV header
|
||||||
|
-n, --non_empty List number of non-empty values for each column
|
||||||
|
```
|
||||||
|
|
||||||
|
|
71
hsparse/extract_columns_to_csv.py
Normal file
71
hsparse/extract_columns_to_csv.py
Normal file
|
@ -0,0 +1,71 @@
|
||||||
|
# MIT License
|
||||||
|
# Copyright (c) 2024 Jeff Moe
|
||||||
|
"""Read a CSV, extract selected columns, and write them to a new CSV."""
|
||||||
|
|
||||||
|
import csv
|
||||||
|
import argparse
|
||||||
|
|
||||||
|
|
||||||
|
def parse_args(argv=None):
    """Parse the command-line arguments of the column extractor.

    Args:
        argv: Optional list of argument strings to parse. When ``None``
            (the default) argparse falls back to ``sys.argv[1:]``, so
            existing callers that pass nothing behave as before.

    Returns:
        argparse.Namespace with the string attributes ``headers_file``,
        ``input_csv`` and ``output_csv``.
    """
    parser = argparse.ArgumentParser(description="Extract CSV Columns, Output CSV")

    parser.add_argument("headers_file", help="Headers File", type=str)
    parser.add_argument("input_csv", help="Input CSV File", type=str)
    parser.add_argument("output_csv", help="Output CSV File", type=str)

    return parser.parse_args(argv)
|
||||||
|
|
||||||
|
|
||||||
|
def read_good_headers(filename):
    """Return the 'good' header names listed in *filename*, one per line.

    Leading and trailing whitespace is stripped from every line. Blank
    lines are skipped so that a spacer line or trailing newline in the
    headers file does not turn into an empty header name.

    Args:
        filename: Path of the text file holding one header name per line.

    Returns:
        List of header names in file order.
    """
    # Read as UTF-8 explicitly, the same encoding filter_csv uses for the
    # CSV files, instead of depending on the platform default.
    with open(filename, "r", encoding="utf-8") as file:
        stripped = (line.strip() for line in file)
        return [name for name in stripped if name]
|
||||||
|
|
||||||
|
|
||||||
|
def filter_csv(input_csv, output_csv, good_headers):
    """Copy only the columns named in *good_headers* from one CSV to another.

    Kept columns are written in the order they appear in the input header,
    not in the order of *good_headers*. Names in *good_headers* that the
    input does not contain are ignored.

    Args:
        input_csv: Path of the CSV file to read; opened as UTF-8, first row
            is taken as the header.
        output_csv: Path of the CSV file to write; opened in "w" mode as
            UTF-8, so an existing file is overwritten.
        good_headers: Iterable of column names to keep.
    """
    # A set gives constant-time membership tests for every header field.
    wanted = set(good_headers)

    with open(input_csv, mode="r", newline="", encoding="utf-8") as infile:
        reader = csv.DictReader(infile)

        # fieldnames is None when the input has no header row (empty file);
        # treat that as "no columns" instead of failing while iterating None.
        filtered_fieldnames = [
            field for field in (reader.fieldnames or []) if field in wanted
        ]

        with open(output_csv, mode="w", newline="", encoding="utf-8") as outfile:
            # extrasaction="ignore" makes the writer drop every key that is
            # not in filtered_fieldnames, so rows can be passed through
            # without building a filtered dict for each one.
            writer = csv.DictWriter(
                outfile, fieldnames=filtered_fieldnames, extrasaction="ignore"
            )

            # Write the header line (column names) first.
            writer.writeheader()
            writer.writerows(reader)
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Command-line entry point for the column extractor.

    Parses the arguments, loads the list of headers to keep, writes the
    filtered CSV, and prints a confirmation naming the output file.
    """
    options = parse_args()
    output_csv = options.output_csv

    # Load the column names to keep, then copy just those columns across.
    wanted_columns = read_good_headers(options.headers_file)
    filter_csv(options.input_csv, output_csv, wanted_columns)

    print(f"Filtered CSV has been written to {output_csv}")


if __name__ == "__main__":
    main()
|
|
@ -1,14 +1,110 @@
|
||||||
#!/usr/bin/env python3
|
# MIT License
|
||||||
''' Read CSV contacts file exported from hubspot.'''
|
# Copyright (c) 2024 Jeff Moe
|
||||||
|
|
||||||
|
""" Read CSV contacts file exported from hubspot."""
|
||||||
|
|
||||||
|
import argparse
|
||||||
import csv
|
import csv
|
||||||
|
import pandas as pd
|
||||||
|
|
||||||
CSV="all-contacts.csv"
|
|
||||||
|
|
||||||
print("Parsing" + CSV)
|
def parse_args(argv=None):
    """Parse the command-line arguments of the contacts CSV inspector.

    Args:
        argv: Optional list of argument strings to parse. When ``None``
            (the default) argparse falls back to ``sys.argv[1:]``, so
            existing callers that pass nothing behave as before.

    Returns:
        argparse.Namespace with ``csv_file`` (str) and the boolean flags
        ``dump``, ``empty``, ``fields`` and ``non_empty``; every flag is
        False unless its option was given.
    """
    parser = argparse.ArgumentParser(description="Parse Hubspot Contacts CSV Export")

    parser.add_argument("csv_file", help="Contacts CSV File", type=str)

    parser.add_argument(
        "-d",
        "--dump",
        help="Dump CSV contents",
        action="store_true",
    )

    parser.add_argument(
        "-e",
        "--empty",
        help="List empty columns",
        action="store_true",
    )

    parser.add_argument(
        "-f",
        "--fields",
        help="Fields from CSV header",
        action="store_true",
    )

    parser.add_argument(
        "-n",
        "--non_empty",
        help="List number of non-empty values for each column",
        action="store_true",
    )

    return parser.parse_args(argv)
|
||||||
|
|
||||||
|
|
||||||
|
def csv_dump(CSV):
    """Print the full contents of a CSV file to stdout.

    The file is read in chunks of 1000 rows and each chunk is printed with
    ``DataFrame.to_string()``, so the column header line is repeated at the
    start of every chunk.

    Args:
        CSV: Path of the CSV file to read.
    """
    # With chunksize set, read_csv returns a reader that keeps the file
    # open; using it as a context manager closes the file even when
    # printing is interrupted part-way through.
    with pd.read_csv(CSV, low_memory=False, chunksize=1000) as reader:
        for chunk in reader:
            print(chunk.to_string())
|
||||||
|
|
||||||
|
|
||||||
|
def csv_empty(CSV):
    """Print the names of the columns in a CSV file that hold no values.

    A column is reported when it has zero non-null cells after pandas has
    parsed the file. If every column has at least one value, a single
    "No empty columns found." line is printed instead.

    Args:
        CSV: Path of the CSV file to read; its first row is the header.
    """
    frame = pd.read_csv(CSV, low_memory=False, header=0)

    # A column with no non-null cells is exactly an all-null column.
    blank = [name for name in frame.columns if frame[name].count() == 0]

    if not blank:
        print("No empty columns found.")
        return

    print("Empty columns:")
    print("\n".join(blank))
|
||||||
|
|
||||||
|
|
||||||
|
def csv_fields(CSV):
    """Print the column names from a CSV file's header, one per line.

    Args:
        CSV: Path of the CSV file to read; its first row is the header.
    """
    # nrows=0 parses only the header row, so listing the fields does not
    # require loading every data row of the export.
    frame = pd.read_csv(CSV, header=0, nrows=0)
    print("\n".join(frame.columns))
|
||||||
|
|
||||||
|
|
||||||
|
def csv_non_empty(CSV):
    """Print per-column counts of non-empty and unique values.

    Columns with no values at all are left out. The remaining columns are
    listed as tab-separated ``name, non-empty count, unique count`` lines,
    ordered by unique count and then non-empty count, both descending. The
    heading line is always printed; when no column has any value it is
    followed by "No non-empty values found.".

    Args:
        CSV: Path of the CSV file to read; its first row is the header.
    """
    frame = pd.read_csv(CSV, low_memory=False, header=0)

    # Non-null cell count per column, keeping only columns that have any.
    filled = {
        name: total for name, total in frame.count().items() if total > 0
    }

    # Pair each kept column with its number of distinct values, then rank.
    ranked = sorted(
        ((name, frame[name].nunique()) for name in filled),
        key=lambda pair: (pair[1], filled[pair[0]]),
        reverse=True,
    )

    print("Column\tNon-empty values\tUnique values")

    if not ranked:
        print("No non-empty values found.")
        return

    for name, distinct in ranked:
        print(f"{name}\t{filled[name]}\t{distinct}")
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Command-line entry point for the contacts CSV inspector.

    Parses the arguments and runs every report whose flag was given. The
    flags are independent of each other, and the reports always run in the
    order dump, empty, fields, non_empty.
    """
    args = parse_args()
    CSV = args.csv_file

    # Each entry pairs a flag value with the report it switches on.
    reports = (
        (args.dump, csv_dump),
        (args.empty, csv_empty),
        (args.fields, csv_fields),
        (args.non_empty, csv_non_empty),
    )
    for requested, report in reports:
        if requested:
            report(CSV)


if __name__ == "__main__":
    main()
|
||||||
|
|
1658
poetry.lock
generated
Normal file
1658
poetry.lock
generated
Normal file
File diff suppressed because it is too large
Load diff
|
@ -22,20 +22,20 @@ packages = [
|
||||||
{ include = "hsparse" },
|
{ include = "hsparse" },
|
||||||
]
|
]
|
||||||
readme = "README.md"
|
readme = "README.md"
|
||||||
version = "0.0.1"
|
version = "0.2.1"
|
||||||
|
|
||||||
|
|
||||||
[tool.poetry.dependencies]
|
[tool.poetry.dependencies]
|
||||||
python = "^3.10"
|
python = "^3.10"
|
||||||
setuptools_scm = "*"
|
setuptools_scm = "*"
|
||||||
|
pandas = "^2.2.2"
|
||||||
|
|
||||||
[build-system]
|
[build-system]
|
||||||
requires = ["poetry-core", "setuptools_scm"]
|
requires = ["poetry-core", "setuptools_scm"]
|
||||||
build-backend = "poetry.core.masonry.api"
|
build-backend = "poetry.core.masonry.api"
|
||||||
|
|
||||||
[tool.poetry.scripts]
|
[tool.poetry.scripts]
|
||||||
hsparse = "hsparse.main:parse_csv_contacts"
|
hsparse-csv-contacts = "hsparse.parse_csv_contacts:main"
|
||||||
|
hsparse-extract-columns = "hsparse.extract_columns_to_csv:main"
|
||||||
|
|
||||||
[tool.poetry.urls]
|
[tool.poetry.urls]
|
||||||
homepage = "https://libre.is/libre/hsparse"
|
homepage = "https://libre.is/libre/hsparse"
|
||||||
|
|
Loading…
Reference in a new issue