#!/usr/bin/env python3
"""Extract content from Force Ouvrière weekly legal newsletter PDF.

Handles PDFs of any length (typically 2-4 pages).
- Page 1: actualités législatives + brèves de jurisprudence
- Pages 2-N: continuation of brèves + focus (may span multiple pages)

Usage:
    python extract_pdf.py document.pdf                    # Raw text (all pages)
    python extract_pdf.py document.pdf --markdown          # Markdown
    python extract_pdf.py document.pdf --page 1            # Page 1 only
    python extract_pdf.py document.pdf --structured        # Structured JSON
    python extract_pdf.py document.pdf --metadata          # Metadata
"""
import sys
import json


# === NOISE PATTERNS TO STRIP ===
# Line prefixes that mark header/footer boilerplate. "Du " presumably targets
# the date-range banner of the newsletter and "I " its page footer — TODO
# confirm against a real PDF.
# NOTE(review): "Du " is broad; any body line that happens to start with
# "Du " is dropped as well (only the "Du côté ..." headings are protected).
NOISE_PREFIXES = [
    "Confédération", "VEILLE", "Secteur", "Du ", "141 avenue",
    "sjuridique", "pdrevon", "01.40", "I ", "141 avenue du Maine",
]

# Content lines that begin with a noise prefix but must be kept: the focus
# parser looks for the headings "Du côté de l'employeur" / "Du côté du salarié".
_KEEP_PREFIXES = ("Du côté",)


def _is_noise(line: str) -> bool:
    """Return True when *line* is header/footer noise rather than content.

    A line is noise when it is blank, starts (ignoring leading whitespace)
    with one of ``NOISE_PREFIXES``, consists only of a one- or two-digit
    page number, or contains the ``force-ouvriere.fr`` domain.

    Lines starting with one of ``_KEEP_PREFIXES`` are never noise, so the
    focus section headings survive the "Du " prefix filter.
    """
    stripped = line.strip()
    if not stripped:
        return True
    # Protected headings win over every noise rule below.
    if stripped.startswith(_KEEP_PREFIXES):
        return False
    # Only leading whitespace is removed here so that a bare "I " footer
    # (trailing space included) still matches its prefix.
    if line.lstrip().startswith(tuple(NOISE_PREFIXES)):
        return True
    # Bare page numbers for documents of any length; "I <n>" footers are
    # already caught by the "I " prefix above.
    if stripped.isdecimal() and len(stripped) <= 2:
        return True
    # Email addresses
    if "force-ouvriere.fr" in line:
        return True
    return False


def _clean_lines(text: str) -> list:
    """Return the stripped lines of *text* that are not header/footer noise.

    The text is split on newline characters; each piece is tested with
    ``_is_noise`` before stripping, and only the pieces that pass are kept,
    in their original order.
    """
    kept = []
    for raw_line in text.split('\n'):
        if _is_noise(raw_line):
            continue
        kept.append(raw_line.strip())
    return kept


def extract_text(path, pages=None):
    """Print the raw text of the selected pages of the PDF at *path*.

    Args:
        path: Path of the PDF file to open.
        pages: Iterable of 0-based page indexes to print, or None to print
            every page.

    Each printed page is preceded by a ``--- Page i/N ---`` banner. Indexes
    outside ``0 .. N-1`` (including negative ones, e.g. from ``--page 0``)
    are skipped with a warning on stderr instead of being passed to the
    document.
    """
    import pymupdf
    # The context manager closes the document even if printing fails.
    with pymupdf.open(path) as doc:
        page_count = len(doc)
        page_range = range(page_count) if pages is None else pages
        for i in page_range:
            if not 0 <= i < page_count:
                print(
                    f"warning: page {i + 1} is out of range "
                    f"(document has {page_count} page(s))",
                    file=sys.stderr,
                )
                continue
            print(f"\n--- Page {i+1}/{page_count} ---\n")
            print(doc[i].get_text())


def extract_markdown(path, pages=None):
    """Print the PDF at *path* converted to Markdown by pymupdf4llm.

    Args:
        path: Path of the PDF file to convert.
        pages: Forwarded unchanged as the ``pages`` argument of
            ``pymupdf4llm.to_markdown``; None leaves the selection to the
            library.
    """
    from pymupdf4llm import to_markdown
    print(to_markdown(path, pages=pages))


def extract_structured(path):
    """Print a JSON description of the FO legal newsletter PDF at *path*.

    All pages are parsed as one continuous stream of cleaned lines, so a
    section may start on one page and continue on the next.

    The printed JSON object contains:
    - ``pages``: number of pages
    - ``metadata``: the document metadata as returned by PyMuPDF
    - ``actualites_legislatives``: text of the legislative section
    - ``breves``: list of ``{"title", "body"}`` case-law briefs
    - ``focus``: result of ``_parse_focus_multi_page``
    - ``page_<n>``: per-page ``{"page", "raw_text", "clean_lines"}``
    """
    import pymupdf

    # Read each page exactly once and release the document right away.
    with pymupdf.open(path) as doc:
        page_texts = [page.get_text() for page in doc]
        metadata = doc.metadata

    # Join with a newline so the last line of a page can never fuse with the
    # first line of the next one; the extra blank line is dropped as noise.
    all_lines = _clean_lines("\n".join(page_texts))

    result = {
        "pages": len(page_texts),
        "metadata": metadata,
        "actualites_legislatives": _parse_legislative(all_lines).strip(),
        "breves": _parse_breves(all_lines),
        "focus": _parse_focus_multi_page(all_lines),
    }

    # Add per-page info
    for page_num, raw_text in enumerate(page_texts, start=1):
        result[f"page_{page_num}"] = {
            "page": page_num,
            "raw_text": raw_text,
            "clean_lines": _clean_lines(raw_text),
        }

    print(json.dumps(result, indent=2, ensure_ascii=False))


def _parse_legislative(lines):
    """Return the lines between the legislative header and "Jurisprudence".

    Collection starts after the line containing "Actualités législatives et
    réglementaires" and stops at the first following line containing
    "Jurisprudence". If no such line follows, everything collected up to the
    end of *lines* is returned instead of being discarded.
    """
    collected = []
    in_section = False
    for line in lines:
        if "Actualités législatives et réglementaires" in line:
            in_section = True
            continue
        if not in_section:
            continue
        if "Jurisprudence" in line:
            break
        collected.append(line)
    return " ".join(collected)


def _parse_breves(lines):
    """Return the briefs found between "Jurisprudence" and "FOCUS".

    A brief starts on a line beginning with "► "; the following lines form
    its body until the next brief or the first line containing "FOCUS".
    Briefs without any body line are not emitted.
    """
    breves = []
    in_jurisprudence = False
    title = None
    body = []

    for line in lines:
        # Nothing after the first FOCUS line belongs to the briefs.
        if "FOCUS" in line:
            break
        # Any line mentioning "Jurisprudence" is treated as the section
        # header and is never added to a body.
        if "Jurisprudence" in line:
            in_jurisprudence = True
            continue
        if not in_jurisprudence:
            continue

        if line.startswith("► "):
            if title and body:
                breves.append({"title": title, "body": " ".join(body)})
            title = line.replace("► ", "").strip()
            body = []
        elif title:
            body.append(line)

    # Don't forget the last brief
    if title and body:
        breves.append({"title": title, "body": " ".join(body)})
    return breves


def _parse_focus_multi_page(lines: list) -> dict:
    """Parse the legal focus, which may span multiple pages.

    Everything after the first line containing "FOCUS" is focus content;
    lines containing "FOCUS" themselves are never part of it.

    Args:
        lines: Text lines of the whole document, in reading order.

    Returns:
        A dict with:
        - ``title``: first non-blank focus line, with ``**`` markers removed
        - ``sections``: ``{"title", "body"}`` for each "Du côté de
          l'employeur" / "Du côté du salarié" heading that has body text.
          Headings are recognised with either a straight or a typographic
          apostrophe and with optional ``**`` bold markers.
        - ``body``: all non-blank focus lines joined with spaces
        - ``paragraphs``: focus lines grouped on blank lines. If the caller
          has already removed blank lines, this holds a single paragraph.
    """
    section_headers = ("Du côté de l'employeur", "Du côté du salarié")

    in_focus = False
    focus_lines = []
    sections = []
    current_section = None
    section_body = []

    for line in lines:
        if "FOCUS" in line:
            in_focus = True
            continue
        if not in_focus:
            continue

        focus_lines.append(line)

        # Compare headings without bold markers and with the typographic
        # apostrophe (U+2019) folded to the ASCII one.
        heading = line.replace("**", "").strip()
        if heading.replace("\u2019", "'") in section_headers:
            if current_section and section_body:
                sections.append({
                    "title": current_section,
                    "body": " ".join(section_body),
                })
            current_section = heading
            section_body = []
        elif current_section and line.strip():
            # Blank lines are skipped, matching how ``body`` is built below.
            section_body.append(line)

    # Save last section
    if current_section and section_body:
        sections.append({
            "title": current_section,
            "body": " ".join(section_body),
        })

    # Title: first non-blank line after FOCUS, without bold markers.
    title = next(
        (l.strip().replace("**", "") for l in focus_lines if l.strip()),
        "",
    )

    # Split into paragraphs (by blank lines) for better segment processing
    paragraphs = []
    current_para = []
    for line in focus_lines:
        if line.strip():
            current_para.append(line)
        elif current_para:
            paragraphs.append(" ".join(current_para))
            current_para = []
    if current_para:
        paragraphs.append(" ".join(current_para))

    return {
        "title": title,
        "sections": sections,
        "body": " ".join(l for l in focus_lines if l.strip()),
        "paragraphs": paragraphs,
    }


def show_metadata(path):
    """Print the page count and core metadata of the PDF at *path* as JSON.

    The output object holds ``pages`` followed by the ``title``, ``author``,
    ``subject``, ``creator``, ``producer`` and ``format`` metadata entries
    (empty string when an entry is absent). Non-ASCII characters are emitted
    as-is, consistent with the structured output.
    """
    import pymupdf
    # The context manager closes the document once the values are read.
    with pymupdf.open(path) as doc:
        page_count = len(doc)
        metadata = doc.metadata or {}

    fields = ("title", "author", "subject", "creator", "producer", "format")
    summary = {"pages": page_count}
    summary.update((name, metadata.get(name, "")) for name in fields)
    print(json.dumps(summary, indent=2, ensure_ascii=False))


def _parse_page_option(args):
    """Return ``[index]`` (0-based) for ``--page N`` in *args*, else None.

    Exits with an error message on stderr when the value is missing, is not
    an integer, or is smaller than 1.
    """
    if "--page" not in args:
        return None
    idx = args.index("--page")
    if idx + 1 >= len(args):
        sys.exit("error: --page requires a page number")
    raw_value = args[idx + 1]
    try:
        page_num = int(raw_value)
    except ValueError:
        sys.exit(f"error: --page expects an integer, got {raw_value!r}")
    if page_num < 1:
        sys.exit(f"error: --page expects a number >= 1, got {page_num}")
    return [page_num - 1]  # Convert to 0-indexed


def main(argv=None):
    """Run the command-line interface.

    Args:
        argv: Argument list without the program name; defaults to
            ``sys.argv[1:]``.

    The first argument is the PDF path. ``--metadata``, ``--structured`` and
    ``--markdown`` select the output (in that order of precedence); without
    any of them the raw text is printed. ``--page N`` restricts the raw-text
    and Markdown outputs to page N (1-based).
    """
    args = sys.argv[1:] if argv is None else list(argv)
    if not args or args[0] in ("-h", "--help"):
        print(__doc__)
        sys.exit(0)

    path = args[0]
    pages = _parse_page_option(args)

    if "--metadata" in args:
        show_metadata(path)
    elif "--structured" in args:
        extract_structured(path)
    elif "--markdown" in args:
        extract_markdown(path, pages=pages)
    else:
        extract_text(path, pages=pages)


if __name__ == "__main__":
    main()