hsparse/hsparse/parse_csv_contacts.py
2024-09-19 19:46:30 -06:00

111 lines
2.4 KiB
Python

# MIT License
# Copyright (c) 2024 Jeff Moe
""" Read CSV contacts file exported from hubspot."""
import argparse
import csv
import pandas as pd
def parse_args(argv=None):
    """Parse the command-line arguments of the contacts CSV tool.

    Args:
        argv: Optional sequence of argument strings to parse. When ``None``
            (the default) argparse falls back to ``sys.argv[1:]``, which is
            what the original zero-argument call did.

    Returns:
        argparse.Namespace: has ``csv_file`` (str) plus the boolean flags
        ``dump``, ``empty``, ``fields`` and ``non_empty``.
    """
    parser = argparse.ArgumentParser(description="Parse Hubspot Contacts CSV Export")
    parser.add_argument("csv_file", help="Contacts CSV File", type=str)

    # Every action is an independent on/off switch; several may be combined.
    switches = (
        ("-d", "--dump", "Dump CSV contents"),
        ("-e", "--empty", "List empty columns"),
        ("-f", "--fields", "Fields from CSV header"),
        ("-n", "--non_empty", "List number of non-empty values for each column"),
    )
    for short_flag, long_flag, help_text in switches:
        parser.add_argument(
            short_flag,
            long_flag,
            help=help_text,
            action="store_true",
        )

    return parser.parse_args(argv)
def csv_dump(CSV):
    """Print the full contents of a CSV file to stdout.

    The file is read in chunks of 1000 rows and each chunk is printed with
    ``DataFrame.to_string()``, so the column header line is repeated at the
    start of every chunk.

    Args:
        CSV: Path (or other source accepted by ``pandas.read_csv``) of the
            CSV file to dump.
    """
    # With ``chunksize`` set, read_csv returns a reader object rather than a
    # DataFrame; the ``with`` block closes the underlying file handle even if
    # printing is interrupted.
    with pd.read_csv(CSV, low_memory=False, chunksize=1000) as reader:
        for chunk in reader:
            print(chunk.to_string())
def csv_empty(CSV):
    """Print the names of columns that contain no values at all.

    A column counts as empty when every one of its cells is null after
    parsing with ``pandas.read_csv``. Prints either an ``Empty columns:``
    heading followed by one column name per line, or a message saying that
    no empty columns were found.

    Args:
        CSV: Path (or other source accepted by ``pandas.read_csv``) of the
            CSV file to inspect; its first row is used as the header.
    """
    df = pd.read_csv(CSV, low_memory=False, header=0)

    # One vectorised pass over the frame instead of a null check per column.
    all_null = df.isna().all()
    empty_columns = [name for name, is_empty in all_null.items() if is_empty]

    if not empty_columns:
        print("No empty columns found.")
        return

    print("Empty columns:")
    print("\n".join(empty_columns))
def csv_fields(CSV):
    """Print the column names from the CSV header, one per line.

    Args:
        CSV: Path (or other source accepted by ``pandas.read_csv``) of the
            CSV file; its first row is used as the header.
    """
    # nrows=0 parses only the header row, so large exports are not loaded
    # into memory just to list their field names.
    header_only = pd.read_csv(CSV, header=0, nrows=0)
    print("\n".join(header_only.columns))
def csv_non_empty(CSV):
    """Print non-empty and unique value counts for each populated column.

    Columns whose cells are all null are skipped. The remaining columns are
    printed as tab-separated ``name / non-empty count / unique count`` rows,
    ordered by unique count and then non-empty count, both descending;
    columns that tie keep their order from the file. When no column holds
    any value, only ``No non-empty values found.`` is printed.

    Args:
        CSV: Path (or other source accepted by ``pandas.read_csv``) of the
            CSV file to inspect; its first row is used as the header.
    """
    df = pd.read_csv(CSV, low_memory=False, header=0)

    # DataFrame.count() ignores nulls, so a count of 0 marks an empty column.
    counts = df.count()
    counts = counts[counts > 0]
    uniques = df[counts.index].nunique()

    # (name, non-empty count, unique count); uniques shares counts' ordering.
    rows = list(zip(counts.index, counts.tolist(), uniques.tolist()))
    # list.sort is stable even with reverse=True, so ties keep file order.
    rows.sort(key=lambda row: (row[2], row[1]), reverse=True)

    if not rows:
        print("No non-empty values found.")
        return

    print("Column\tNon-empty values\tUnique values")
    for col, count, unique_count in rows:
        print(f"{col}\t{count}\t{unique_count}")
def main():
    """Run every report selected on the command line against the CSV file.

    Reports run in a fixed order (dump, empty, fields, non-empty) regardless
    of the order the flags were given in; if no flag is set nothing is
    printed.
    """
    args = parse_args()
    CSV = args.csv_file

    # (flag value, report function) pairs, in execution order.
    selected_reports = (
        (args.dump, csv_dump),
        (args.empty, csv_empty),
        (args.fields, csv_fields),
        (args.non_empty, csv_non_empty),
    )
    for enabled, report in selected_reports:
        if enabled:
            report(CSV)
# Run the CLI only when executed as a script, not when imported as a module.
if __name__ == "__main__":
    main()