#!/usr/bin/env python3 """ Convert raw Office/PDF files into markdown with YAML frontmatter. Usage: python convert.py # convert all files in raw/ python convert.py path/to/file.docx # convert a single file python convert.py --out documents # override output subfolder Reads from: database/seeders/data/raw/ Writes to: database/seeders/data/documents/ (default, or specify --out) Supported formats: .docx, .pptx, .xlsx, .pdf, .txt, .md, .csv """ import argparse import hashlib import re import sys from datetime import date from pathlib import Path # ── Extractors ────────────────────────────────────────────────────── def extract_docx(path: Path) -> tuple[str, dict]: """Extract text from Word .docx files, preserving heading structure.""" from docx import Document doc = Document(str(path)) lines = [] for para in doc.paragraphs: text = para.text.strip() if not text: lines.append("") continue style = para.style.name.lower() if para.style else "" if "heading 1" in style: lines.append(f"# {text}") elif "heading 2" in style: lines.append(f"## {text}") elif "heading 3" in style: lines.append(f"### {text}") elif "title" in style: lines.append(f"# {text}") else: lines.append(text) # Extract tables for table in doc.tables: lines.append("") for i, row in enumerate(table.rows): cells = [cell.text.strip().replace("\n", " ") for cell in row.cells] lines.append("| " + " | ".join(cells) + " |") if i == 0: lines.append("| " + " | ".join(["---"] * len(cells)) + " |") lines.append("") meta = {} props = doc.core_properties if props.author: meta["auteur"] = props.author if props.title: meta["titel"] = props.title if props.created: meta["datum"] = props.created.strftime("%Y-%m-%d") content = "\n".join(lines).strip() # Clean up excessive blank lines content = re.sub(r"\n{3,}", "\n\n", content) return content, meta def extract_pptx(path: Path) -> tuple[str, dict]: """Extract text from PowerPoint .pptx files, one section per slide.""" from pptx import Presentation prs = Presentation(str(path)) lines = [] for i, slide in enumerate(prs.slides, 1): slide_title = "" slide_texts = [] for shape in slide.shapes: if shape.has_text_frame: for para in shape.text_frame.paragraphs: text = para.text.strip() if text: slide_texts.append(text) if hasattr(shape, "name") and "title" in shape.name.lower(): if shape.has_text_frame: title_text = shape.text_frame.text.strip() if title_text: slide_title = title_text # Extract tables from slides for shape in slide.shapes: if shape.has_table: table = shape.table for row_idx, row in enumerate(table.rows): cells = [cell.text.strip().replace("\n", " ") for cell in row.cells] slide_texts.append("| " + " | ".join(cells) + " |") if row_idx == 0: slide_texts.append("| " + " | ".join(["---"] * len(cells)) + " |") heading = slide_title or f"Slide {i}" lines.append(f"## {heading}") lines.append("") lines.extend(slide_texts) lines.append("") # Slide notes if slide.has_notes_slide and slide.notes_slide.notes_text_frame: notes = slide.notes_slide.notes_text_frame.text.strip() if notes: lines.append(f"> Notities: {notes}") lines.append("") meta = {} content = "\n".join(lines).strip() content = re.sub(r"\n{3,}", "\n\n", content) return content, meta def extract_xlsx(path: Path) -> tuple[str, dict]: """Extract data from Excel .xlsx files, one section per sheet.""" from openpyxl import load_workbook wb = load_workbook(str(path), data_only=True) lines = [] for sheet_name in wb.sheetnames: ws = wb[sheet_name] rows = list(ws.iter_rows(values_only=True)) if not rows: continue lines.append(f"## {sheet_name}") lines.append("") for i, row in enumerate(rows): cells = [str(cell).strip() if cell is not None else "" for cell in row] # Skip completely empty rows if not any(cells): continue lines.append("| " + " | ".join(cells) + " |") if i == 0: lines.append("| " + " | ".join(["---"] * len(cells)) + " |") lines.append("") meta = {} content = "\n".join(lines).strip() content = re.sub(r"\n{3,}", "\n\n", content) return content, meta def extract_pdf(path: Path) -> tuple[str, dict]: """Extract text from PDF files using PyMuPDF.""" import fitz # pymupdf doc = fitz.open(str(path)) lines = [] for page_num, page in enumerate(doc, 1): text = page.get_text("text").strip() if text: if len(doc) > 1: lines.append(f"## Pagina {page_num}") lines.append("") lines.append(text) lines.append("") meta = {} pdf_meta = doc.metadata if pdf_meta: if pdf_meta.get("author"): meta["auteur"] = pdf_meta["author"] if pdf_meta.get("title"): meta["titel"] = pdf_meta["title"] if pdf_meta.get("creationDate"): try: raw = pdf_meta["creationDate"] # PDF dates: D:YYYYMMDDHHmmSS if raw.startswith("D:"): raw = raw[2:] meta["datum"] = f"{raw[:4]}-{raw[4:6]}-{raw[6:8]}" except (IndexError, ValueError): pass doc.close() content = "\n".join(lines).strip() content = re.sub(r"\n{3,}", "\n\n", content) return content, meta def extract_text(path: Path) -> tuple[str, dict]: """Read plain text / markdown / csv files.""" import chardet raw_bytes = path.read_bytes() detected = chardet.detect(raw_bytes) encoding = detected.get("encoding", "utf-8") or "utf-8" try: content = raw_bytes.decode(encoding) except (UnicodeDecodeError, LookupError): content = raw_bytes.decode("utf-8", errors="replace") return content.strip(), {} # ── File type routing ─────────────────────────────────────────────── EXTRACTORS = { ".docx": extract_docx, ".doc": None, # needs LibreOffice, warn user ".pptx": extract_pptx, ".ppt": None, ".xlsx": extract_xlsx, ".xls": None, ".pdf": extract_pdf, ".txt": extract_text, ".md": extract_text, ".csv": extract_text, } def slugify(text: str) -> str: """Convert text to a filename-safe slug.""" text = text.lower().strip() text = re.sub(r"[^\w\s-]", "", text) text = re.sub(r"[\s_]+", "-", text) text = re.sub(r"-+", "-", text) return text[:80].strip("-") def build_frontmatter(filename: str, meta: dict) -> str: """Build YAML frontmatter from extracted metadata + filename.""" titel = meta.get("titel") or filename.rsplit(".", 1)[0].replace("-", " ").replace("_", " ").title() auteur = meta.get("auteur", "") datum = meta.get("datum", date.today().isoformat()) lines = ["---"] lines.append(f"titel: \"{titel}\"") if auteur: lines.append(f"auteur: \"{auteur}\"") lines.append(f"type: document") lines.append(f"datum: {datum}") lines.append(f"bron: \"{filename}\"") lines.append("---") return "\n".join(lines) def convert_file(file_path: Path, out_dir: Path) -> Path | None: """Convert a single file to markdown with frontmatter.""" suffix = file_path.suffix.lower() if suffix not in EXTRACTORS: print(f" SKIP {file_path.name} — unsupported format ({suffix})") return None if EXTRACTORS[suffix] is None: print(f" SKIP {file_path.name} — old Office format ({suffix}), save as {suffix}x first") return None try: content, meta = EXTRACTORS[suffix](file_path) except Exception as e: print(f" ERROR {file_path.name} — {e}") return None if not content.strip(): print(f" EMPTY {file_path.name} — no text extracted") return None frontmatter = build_frontmatter(file_path.name, meta) slug = slugify(meta.get("titel", file_path.stem)) out_path = out_dir / f"{slug}.md" # Avoid overwriting — append hash if collision if out_path.exists(): short_hash = hashlib.md5(file_path.name.encode()).hexdigest()[:6] out_path = out_dir / f"{slug}-{short_hash}.md" out_path.write_text(f"{frontmatter}\n\n{content}\n", encoding="utf-8") print(f" OK {file_path.name} → {out_path.relative_to(out_dir.parent)}") return out_path # ── CLI ───────────────────────────────────────────────────────────── def main(): parser = argparse.ArgumentParser(description="Convert Office/PDF files to markdown for seeding") parser.add_argument("files", nargs="*", help="Specific files to convert (default: all in raw/)") parser.add_argument("--out", default="documents", help="Output subfolder name (default: documents)") parser.add_argument("--data-dir", default=None, help="Override data directory path") args = parser.parse_args() # Resolve paths script_dir = Path(__file__).resolve().parent project_root = script_dir.parent data_dir = Path(args.data_dir) if args.data_dir else project_root / "database" / "seeders" / "data" raw_dir = data_dir / "raw" out_dir = data_dir / args.out out_dir.mkdir(parents=True, exist_ok=True) if args.files: files = [Path(f) for f in args.files] else: if not raw_dir.exists(): print(f"No raw/ directory at {raw_dir}") sys.exit(1) files = sorted(f for f in raw_dir.iterdir() if f.is_file() and not f.name.startswith(".")) if not files: print(f"No files found in {raw_dir}") print("Drop .docx, .pptx, .xlsx, .pdf files there and re-run.") sys.exit(0) print(f"Converting {len(files)} file(s) → {out_dir.relative_to(project_root)}/\n") converted = 0 for f in files: result = convert_file(f, out_dir) if result: converted += 1 print(f"\nDone: {converted}/{len(files)} files converted.") if __name__ == "__main__": main()