← Back to Library Pop-Up Tools

Catalog Cleaner (catalog_cleaner.py)

This tool cleans an exported catalog CSV: trims punctuation, normalizes titles/authors/years, and adds a simple suspicious_isbn flag when an ISBN length looks unusual.

How It Works (In Plain Language)

  • Reads your CSV file row by row using Python’s built-in csv module.
  • Cleans author names by trimming spaces and removing dots/commas at the end (for example "Smith, Jane." → "Smith, Jane").
  • Cleans titles by trimming spaces and removing trailing punctuation like / : ; , often found in exports.
  • Normalizes years by pulling out the first 4 digits it finds (for example "c2017." → "2017").
  • Checks ISBNs by counting digits; if it’s not 10 or 13 digits, it marks suspicious_isbn = "yes".
  • Writes a new CSV with all the original columns plus the new suspicious_isbn column.

How to Use

  1. Place your catalog CSV (for example my_catalog.csv) in the same folder as this file.
  2. Open Terminal and run: cd ~/Desktop/library_pop_up_tools
  3. Then run: python catalog_cleaner.py my_catalog.csv cleaned_catalog.csv
  4. Open cleaned_catalog.csv in Excel/Numbers to review.
library_pop_up_tools % python catalog_cleaner.py my_catalog.csv cleaned_catalog.csv
Cleaned file written to: cleaned_catalog.csv

Optional: Adjust What It Cleans

You can use this tool without changing anything. The small helper functions below are just “rules” for how to clean text. Only change them if you want different trimming or year/ISBN handling.

def normalize_author(name: str) -> str:
    # Trim surrounding spaces, then drop any trailing "." or "," characters.
    # An empty value comes back as "".
    ...

def normalize_title(title: str) -> str:
    # Trim surrounding spaces, then drop any trailing / : ; , characters.
    # An empty value comes back as "".
    ...

def normalize_year(year: str) -> str:
    # Reduce the value to 4 digits when it contains at least 4 digits
    # (e.g. "c2017." → "2017"); otherwise only trim surrounding spaces.
    ...

def is_suspicious_isbn(isbn: str) -> bool:
    # Flag an ISBN whose length does not look like ISBN-10 or ISBN-13.
    # An empty value is never flagged.
    ...

Full Python Source (Optional)

Click to show the full script
#!/usr/bin/env python3
"""
catalog_cleaner.py

Pop-up tool for cleaning exported catalog/collection data (CSV).
Safe to run locally; it only reads an input CSV and writes a new cleaned CSV.

Example:
    python catalog_cleaner.py input.csv cleaned_output.csv
"""

import csv
import sys
from pathlib import Path


def normalize_author(name: str) -> str:
    """Return *name* without surrounding whitespace or trailing dots/commas.

    Trailing "." and "," characters are removed one at a time, together with
    any whitespace exposed in front of them, so "Doe, John ,. " becomes
    "Doe, John". A falsy value (empty string or None) yields "".
    """
    cleaned = name.strip() if name else ""
    # endswith() is False for an empty string, so the loop also stops once
    # everything has been stripped away.
    while cleaned.endswith((".", ",")):
        cleaned = cleaned[:-1].rstrip()
    return cleaned


def normalize_title(title: str) -> str:
    """Return *title* trimmed of whitespace and trailing "/", ":", ";", ",".

    Catalog exports often leave these separators dangling at the end of a
    title field. They are peeled off repeatedly, along with any whitespace in
    front of them. A falsy value (empty string or None) yields "".
    """
    if not title:
        return ""

    trailing_marks = ("/", ":", ";", ",")
    result = title.strip()
    while result.endswith(trailing_marks):
        # Drop the separator, then any spaces that preceded it.
        result = result[:-1].rstrip()
    return result


def normalize_year(year: str) -> str:
    """Reduce a raw year/date cell to a four-digit year when possible.

    Resolution order:

    1. The first run of four consecutive digits in the value is returned
       ("c2017." -> "2017", "May 5, 2017" -> "2017", "2017-2018" -> "2017").
       For a longer run, its first four digits are used.
    2. If there is no such run but the value still contains at least four
       digits overall, the first four digits are joined together
       ("[19]98" -> "1998").
    3. Otherwise the value is returned with surrounding whitespace removed.

    A falsy value (empty string or None) yields "".
    """
    if not year:
        return ""
    year = year.strip()

    # Prefer contiguous digits: gluing unrelated numbers together would turn
    # "May 5, 2017" into "5201".
    run_start = None
    for index, ch in enumerate(year):
        if not ch.isdigit():
            run_start = None
            continue
        if run_start is None:
            run_start = index
        if index - run_start == 3:
            return year[run_start : index + 1]

    # Fallback for bracketed/split years such as "[19]98".
    digits = "".join(ch for ch in year if ch.isdigit())
    if len(digits) >= 4:
        return digits[:4]
    return year


def is_suspicious_isbn(isbn: str) -> bool:
    """Return True when *isbn* does not have the length of an ISBN-10/ISBN-13.

    Only the length is checked, not the checksum. A value is accepted when it
    contains exactly 10 or 13 digits, or exactly 9 digits followed by the
    ISBN-10 check character "X" (hyphens and spaces between them are
    allowed). Any other text in the cell is ignored for counting.

    Empty, None, and whitespace-only values return False: a missing ISBN is
    not treated as a suspicious one.
    """
    if not isbn or not isbn.strip():
        return False

    digit_count = 0
    has_x_check_digit = False
    # True while nothing but ISBN separators has followed the last digit, so
    # an "x" inside a qualifier such as "(Oxford pbk.)" is not mistaken for a
    # check character.
    adjacent_to_digits = False
    for ch in isbn:
        if ch.isdigit():
            digit_count += 1
            adjacent_to_digits = True
        elif ch in "- ":
            continue
        else:
            if ch in "xX" and adjacent_to_digits and digit_count == 9:
                has_x_check_digit = True
            adjacent_to_digits = False

    if digit_count == 9 and has_x_check_digit:
        return False
    return digit_count not in (10, 13)


def _is_year_column(header: str) -> bool:
    """Return True when a column header names a publication year/date field."""
    lowered = header.lower()
    if "year" in lowered:
        return True
    # "pub" also occurs in publisher / place-of-publication headers. Those
    # hold free text and must not be collapsed to four digits.
    return "pub" in lowered and not any(
        word in lowered for word in ("publisher", "place")
    )


def clean_catalog(input_path: Path, output_path: Path) -> None:
    """Read the CSV at *input_path* and write a cleaned copy to *output_path*.

    Columns are selected by header name, case-insensitively:

    * ``author`` / ``main_author`` -> ``normalize_author``
    * ``title`` / ``main_title``   -> ``normalize_title``
    * headers containing ``year``, or ``pub`` without ``publisher``/``place``
      -> ``normalize_year``
    * headers containing ``isbn`` (except the ``suspicious_isbn`` flag column
      itself) are checked with ``is_suspicious_isbn``

    The output keeps every input column and has a ``suspicious_isbn`` column
    set to "yes" when any ISBN column of the row looks suspicious, else "".
    Missing cells in short rows are written as empty strings. Cells beyond
    the header width cannot be written to any column, so they are dropped
    with a warning on stderr.

    Args:
        input_path: CSV file to read (UTF-8, optional BOM).
        output_path: CSV file to create or overwrite (UTF-8).

    Raises:
        ValueError: if *output_path* is the same file as *input_path*.
            Opening it for writing would erase the input before it is read.
        OSError: if either file cannot be opened.
    """
    if output_path.exists() and input_path.samefile(output_path):
        raise ValueError(
            f"Output file must be different from the input file: {input_path}"
        )

    with input_path.open(newline="", encoding="utf-8-sig") as infile, output_path.open(
        "w", newline="", encoding="utf-8"
    ) as outfile:
        reader = csv.DictReader(infile)
        headers = list(reader.fieldnames or [])
        fieldnames = list(headers)
        if "suspicious_isbn" not in fieldnames:
            fieldnames.append("suspicious_isbn")

        # The header is the same for every row, so classify columns once.
        author_keys = [h for h in headers if h.lower() in ("author", "main_author")]
        title_keys = [h for h in headers if h.lower() in ("title", "main_title")]
        year_keys = [h for h in headers if _is_year_column(h)]
        # Skip our own flag column: on a re-run its old "yes" would be read
        # as a malformed ISBN and keep the row flagged forever.
        isbn_keys = [
            h
            for h in headers
            if "isbn" in h.lower() and h.lower() != "suspicious_isbn"
        ]

        writer = csv.DictWriter(outfile, fieldnames=fieldnames)
        writer.writeheader()

        for raw_row in reader:
            # DictReader stores cells beyond the header under the key None.
            overflow = raw_row.get(None)
            if overflow:
                print(
                    f"Warning: line {reader.line_num}: ignored {len(overflow)} "
                    "value(s) beyond the header columns",
                    file=sys.stderr,
                )
            # Short rows are padded with None by DictReader; write them as "".
            row = {
                key: "" if value is None else value
                for key, value in raw_row.items()
                if key is not None
            }

            for k in author_keys:
                row[k] = normalize_author(row[k])
            for k in title_keys:
                row[k] = normalize_title(row[k])
            for k in year_keys:
                row[k] = normalize_year(row[k])

            suspicious = any(is_suspicious_isbn(row[k]) for k in isbn_keys)
            row["suspicious_isbn"] = "yes" if suspicious else ""

            writer.writerow(row)


def main(argv: list[str]) -> int:
    """Run the cleaner from the command line.

    Args:
        argv: Full argument vector, i.e. ``[program, input_csv, output_csv]``.
            A leading ``~`` in either path is expanded.

    Returns:
        0 when the cleaned file was written, or 1 when the arguments are
        wrong, the input is not an existing regular file, or the output path
        refers to the input file. Diagnostics go to stderr; the success
        message goes to stdout.
    """
    if len(argv) != 3:
        print(
            "Usage: python catalog_cleaner.py input.csv cleaned_output.csv",
            file=sys.stderr,
        )
        return 1
    input_path = Path(argv[1]).expanduser()
    output_path = Path(argv[2]).expanduser()
    # is_file() also rejects directories, which exists() would let through
    # only to fail later with a traceback.
    if not input_path.is_file():
        print(f"Input file not found: {input_path}", file=sys.stderr)
        return 1
    # Opening the output for writing truncates it, so writing onto the input
    # would erase the catalog before a single row is read.
    if output_path.exists() and input_path.samefile(output_path):
        print(
            f"Output file must be different from the input file: {input_path}",
            file=sys.stderr,
        )
        return 1
    clean_catalog(input_path, output_path)
    print(f"Cleaned file written to: {output_path}")
    return 0


# Script entry point: pass the command-line arguments to main() and use its
# return value as the process exit status.
if __name__ == "__main__":
    sys.exit(main(sys.argv))

← Back to all tools