"""
SiteHub LinkedIn ICP Tracker — Weekly Processing Script
Handles: parsing, company enrichment, classification, tracker updates.
"""

import re
import time
import os
from datetime import datetime
from pathlib import Path

try:
    import openpyxl
    from openpyxl.utils import get_column_letter
    from openpyxl.styles import Font, PatternFill, Alignment, Border, Side
    from openpyxl.chart import BarChart, LineChart, Reference
    from openpyxl.chart.label import DataLabelList
except ImportError:
    raise SystemExit("openpyxl required: pip install openpyxl")

# ---------------------------------------------------------------------------
# Paths (defaults — override via function args)
# ---------------------------------------------------------------------------
BASE_DIR = Path(__file__).resolve().parent                    # directory containing this script
TRACKER_PATH = BASE_DIR / "SiteHub_LinkedIn_ICP_Tracker.xlsx"  # main tracker workbook
REFERENCE_PATH = BASE_DIR / "ICP_Reference_Lists.xlsx"         # ICP reference company lists
CONFIG_PATH = BASE_DIR / "icp_config.json"                     # optional JSON config (API keys etc.)

# ---------------------------------------------------------------------------
# Load config (API keys, etc.)
# ---------------------------------------------------------------------------
def _load_config() -> dict:
    """Load configuration from icp_config.json."""
    if CONFIG_PATH.exists():
        try:
            import json
            with open(CONFIG_PATH, "r", encoding="utf-8") as f:
                return json.load(f)
        except Exception:
            pass
    return {}

# Module-level config, loaded once at import time.
_CONFIG = _load_config()

# Set environment variables from config if not already set.
# The template placeholder value is explicitly ignored so an unedited
# config template never pollutes the environment.
if _CONFIG.get("brave_api_key") and not os.environ.get("BRAVE_API_KEY"):
    if _CONFIG["brave_api_key"] != "PASTE_YOUR_BRAVE_API_KEY_HERE":
        os.environ["BRAVE_API_KEY"] = _CONFIG["brave_api_key"]

# ---------------------------------------------------------------------------
# Database mode (--use-db flag)
# ---------------------------------------------------------------------------
_USE_DB = False       # Set by --use-db flag in main(); file persistence otherwise
_DB_SESSION = None    # Singleton SQLAlchemy session per run (created lazily)


def _get_db_session():
    """Return the shared DB session, creating it on first use (DB mode)."""
    global _DB_SESSION
    if _DB_SESSION is not None:
        return _DB_SESSION
    from icp_db import get_session
    _DB_SESSION = get_session()
    return _DB_SESSION


# ---------------------------------------------------------------------------
# LinkedIn Data Cleaning — remove connection degrees and other noise
# ---------------------------------------------------------------------------
# Patterns for connection degree indicators (1st, 2nd, 3rd, 3rd+, etc.)
_CONNECTION_DEGREE_PATTERNS = [
    r"^\s*1st\s*$",           # Line containing only "1st"
    r"^\s*2nd\s*$",           # Line containing only "2nd"
    r"^\s*3rd\+?\s*$",        # Line containing only "3rd" or "3rd+"
    r"^\s*\d+(?:st|nd|rd|th)\+?\s+degree\s*$",  # "1st degree", "2nd degree", etc.
    r"^\s*\d+(?:st|nd|rd|th)\+?\s+connection\s*$",  # "1st connection", etc.
    r"^\s*\d+(?:st|nd|rd|th)\+?\s+degree\s+connection\s*$",  # "2nd degree connection"
]
_CONNECTION_DEGREE_RE = re.compile("|".join(_CONNECTION_DEGREE_PATTERNS), re.IGNORECASE)

# Inline connection degree (to strip from names/titles)
_INLINE_DEGREE_RE = re.compile(
    r"\s*[·•\-–—|]\s*\d+(?:st|nd|rd|th)\+?(?:\s+degree)?(?:\s+connection)?\s*$",
    re.IGNORECASE
)

# Lines to skip entirely (LinkedIn noise)
_SKIP_LINE_PATTERNS = [
    r"^\s*like\s*$",          # Reaction type
    r"^\s*support\s*$",       # Reaction type
    r"^\s*celebrate\s*$",     # Reaction type
    r"^\s*love\s*$",          # Reaction type
    r"^\s*insightful\s*$",    # Reaction type
    r"^\s*funny\s*$",         # Reaction type
    r"^\s*curious\s*$",       # Reaction type
    r"^\s*View\s+.+profile",  # "View X's profile..."
    r"^\s*\d+[,.]?\d*\s*followers?\s*$",  # "13,447 followers"
    # Note: bare dash "-" is NOT skipped — it's a valid LinkedIn title meaning "no title set"
]
_SKIP_LINE_RE = re.compile("|".join(_SKIP_LINE_PATTERNS), re.IGNORECASE)


def _clean_linkedin_data(text: str) -> str:
    """Clean LinkedIn-specific noise from pasted data.

    Removes:
    - Reaction type lines (like, support, celebrate, insightful, etc.)
    - "View X's profile" lines and suffixes (e.g., "* NameView Name's profile" → "Name")
    - Connection degree lines ("1st", "2nd", "3rd+", "2nd degree connection· 1st")
    - Follower count lines
    - Company/organization pages (identified by having a "followers" line after them)
    - Inline connection indicators at end of names/titles
    """
    lines = text.splitlines()
    cleaned_lines = []

    # Pattern to detect "followers" lines (indicates previous line was a company page)
    followers_pattern = re.compile(r"^\s*\d+[,.]?\d*\s*followers?\s*$", re.IGNORECASE)

    # Pattern to detect "View X's profile" suffix in name lines
    # Matches: "NameView Name's profile" (LinkedIn accessibility text appended to names)
    _view_profile_re = re.compile(r"View\s+.+?['\u2019\u0027\u02BC]s?\s+profile\s*$", re.IGNORECASE)

    i = 0
    while i < len(lines):
        line = lines[i]
        stripped = line.strip()

        # Skip empty lines
        if not stripped:
            i += 1
            continue

        # Handle "* NameView Name's profile" lines → extract just the name
        view_match = _view_profile_re.search(stripped)
        if view_match:
            name_part = stripped[:view_match.start()].strip()
            # Strip leading "* " (LinkedIn list marker)
            if name_part.startswith("*"):
                name_part = name_part[1:].strip()
            if name_part:
                cleaned_lines.append(name_part)
            i += 1
            continue

        # Strip inline degree suffix FIRST (e.g., "1st degree connection· 1st" → "1st degree connection")
        cleaned_stripped = _INLINE_DEGREE_RE.sub("", stripped).strip()

        # Skip reaction types, profile view lines, follower counts
        if _SKIP_LINE_RE.match(cleaned_stripped):
            i += 1
            continue

        # Skip lines that are just connection degree indicators (checked after inline stripping)
        if _CONNECTION_DEGREE_RE.match(cleaned_stripped):
            i += 1
            continue

        # Check if next line is a "followers" line - if so, current line is a company page
        if i + 1 < len(lines):
            next_stripped = lines[i + 1].strip()
            if followers_pattern.match(next_stripped):
                # Skip this company page and the followers line
                i += 2
                continue

        cleaned_lines.append(cleaned_stripped)
        i += 1

    return "\n".join(cleaned_lines)


# ---------------------------------------------------------------------------
# SiteHub Employee Filter — exclude internal employees from tracker
# ---------------------------------------------------------------------------
# "SiteHub" spelled with/without a space or hyphen, matched case-insensitively.
_SITEHUB_PATTERNS = [
    r"sitehub",
    r"site\s*hub",
    r"site-hub",
]
_SITEHUB_RE = re.compile("|".join(_SITEHUB_PATTERNS), re.IGNORECASE)

# Known SiteHub employees (names that don't always show SiteHub in title).
# Stored lowercase; ASCII variants are included for names containing æ/ø/å
# so both spellings match.
_SITEHUB_EMPLOYEES = {
    "lau mikkelsen",
    "ulrik branner",
    "andré sode",
    "andre sode",
    "morten abildgaard",
    "jens peter sørensen",
    "jens peter sorensen",
    "mikkel bredsgaard",
    "mikkel poulsen",
    "oliver sebastian jensen",
    "diana hervø",
    "diana hervo",
    "rie l.m. petersen",
    "william peter roust sørensen",
    "william peter roust sorensen",
    "jan pedersen",
    "henna ahmed",
    "howard wills",
    "julie branner",
    "mette ankerstjerne",
}

# File for additional SiteHub employees added during review (one name/line)
_SITEHUB_EMPLOYEES_FILE = BASE_DIR / "sitehub_employees.txt"

def _load_additional_sitehub_employees():
    """Merge extra SiteHub employee names into ``_SITEHUB_EMPLOYEES``.

    In DB mode the names come from the database; otherwise they are read
    from ``sitehub_employees.txt`` (one lowercase name per line, ``#``
    starts a comment line). Loading is best-effort: an unreadable file is
    ignored and the built-in list is kept.
    """
    if _USE_DB:
        from icp_db import load_sitehub_employees
        db_names = load_sitehub_employees(_get_db_session())
        _SITEHUB_EMPLOYEES.update(db_names)
        return

    if _SITEHUB_EMPLOYEES_FILE.exists():
        try:
            with open(_SITEHUB_EMPLOYEES_FILE, "r", encoding="utf-8") as f:
                for line in f:
                    name = line.strip().lower()
                    # Ignore blanks and comment lines
                    if name and not name.startswith("#"):
                        _SITEHUB_EMPLOYEES.add(name)
        except OSError:
            # Narrowed from bare Exception: only I/O failures are expected
            # here, and the file is optional.
            pass

def _add_sitehub_employee(name: str) -> bool:
    """Add *name* to the SiteHub employees list and persist it.

    Args:
        name: Employee name; normalized to lowercase before storing.

    Returns:
        True when the name was newly added and persisted (DB row or file
        append), False when it already existed or the file write failed.
        Note: on a failed write the in-memory set is still updated.
    """
    name_lower = name.lower().strip()
    if name_lower in _SITEHUB_EMPLOYEES:
        return False  # Already exists

    _SITEHUB_EMPLOYEES.add(name_lower)

    if _USE_DB:
        from icp_db import add_sitehub_employee
        add_sitehub_employee(_get_db_session(), name_lower)
        return True

    # Append to file for persistence
    try:
        with open(_SITEHUB_EMPLOYEES_FILE, "a", encoding="utf-8") as f:
            f.write(f"{name_lower}\n")
        return True
    except OSError:
        # Narrowed from bare Exception: only file-system errors expected.
        return False

# Load additional employees on module import (file or DB, depending on mode)
_load_additional_sitehub_employees()


# ---------------------------------------------------------------------------
# Classification Pattern Learning
# ---------------------------------------------------------------------------
# Persistent store for learned patterns (JSON, ignored in DB mode)
_LEARNED_PATTERNS_FILE = BASE_DIR / "learned_classification_patterns.json"

# Structure: {
#   "title_keywords": {
#       "projektleder": {"ICP": 5, "POI": 1, "Not ICP": 0},
#       "byggeleder": {"ICP": 8, "POI": 0, "Not ICP": 0},
#       ...
#   },
#   "companies": {
#       "sund & bælt": {"status": "ICP", "count": 2},
#       ...
#   }
# }
# In-memory pattern store; mutated by learn_from_classification() and
# persisted via _save_learned_patterns().
_LEARNED_PATTERNS: dict = {"title_keywords": {}, "companies": {}}


def _load_learned_patterns():
    """Load learned classification patterns from file (or DB).

    Resets to the empty structure when the file is unreadable or malformed.
    After a successful load, both top-level keys are guaranteed to exist so
    callers can index them without checks.
    """
    global _LEARNED_PATTERNS
    if _USE_DB:
        from icp_db import load_learned_patterns
        _LEARNED_PATTERNS = load_learned_patterns(_get_db_session())
        return

    if _LEARNED_PATTERNS_FILE.exists():
        try:
            import json
            with open(_LEARNED_PATTERNS_FILE, "r", encoding="utf-8") as f:
                _LEARNED_PATTERNS = json.load(f)
            # Robustness: tolerate files written by older versions that may
            # lack one of the top-level keys.
            _LEARNED_PATTERNS.setdefault("title_keywords", {})
            _LEARNED_PATTERNS.setdefault("companies", {})
        except (OSError, ValueError, AttributeError):
            # Narrowed from bare Exception. OSError: unreadable file;
            # ValueError: invalid JSON (json.JSONDecodeError subclasses it);
            # AttributeError: JSON root was not an object/dict.
            _LEARNED_PATTERNS = {"title_keywords": {}, "companies": {}}


def _save_learned_patterns():
    """Persist learned classification patterns to file (or DB).

    Returns:
        True on success, False when the JSON file could not be written.
        The DB branch always reports success (errors propagate from icp_db).
    """
    if _USE_DB:
        from icp_db import save_learned_pattern_title, save_learned_pattern_company
        session = _get_db_session()
        for keyword, counts in _LEARNED_PATTERNS.get("title_keywords", {}).items():
            save_learned_pattern_title(
                session, keyword,
                counts.get("ICP", 0),
                counts.get("POI", 0),
                counts.get("Not ICP", 0),
            )
        for company, info in _LEARNED_PATTERNS.get("companies", {}).items():
            save_learned_pattern_company(
                session, company,
                info.get("status", "Review"),
                info.get("count", 0),
            )
        return True

    try:
        import json
        with open(_LEARNED_PATTERNS_FILE, "w", encoding="utf-8") as f:
            json.dump(_LEARNED_PATTERNS, f, indent=2, ensure_ascii=False)
        return True
    except (OSError, TypeError, ValueError):
        # Narrowed from bare Exception. OSError: unwritable file;
        # TypeError/ValueError: data that json cannot serialize.
        return False


def _extract_title_keywords(title: str) -> list[str]:
    """Extract meaningful keywords from a job title."""
    if not title:
        return []

    title_lower = title.lower()

    # Key role indicators to track
    role_keywords = [
        "projektleder", "byggeleder", "direktør", "director", "chef", "leder",
        "manager", "head", "lead", "koordinator", "coordinator", "specialist",
        "konsulent", "consultant", "rådgiver", "advisor", "ingeniør", "engineer",
        "arkitekt", "architect", "udvikler", "developer", "analyst", "analytiker",
        "ceo", "cfo", "cto", "coo", "vp", "partner", "founder", "owner", "ejer",
        "senior", "junior", "trainee", "praktikant", "studerende", "student",
        "operations", "logistik", "logistics", "bæredygtighed", "sustainability",
        "sikkerhed", "safety", "hse", "kvalitet", "quality", "indkøb", "procurement",
        "salg", "sales", "marketing", "hr", "økonomi", "finance", "it", "digital"
    ]

    found = []
    for keyword in role_keywords:
        if keyword in title_lower:
            found.append(keyword)

    return found


def learn_from_classification(name: str, title: str, company: str | None, status: str):
    """Learn patterns from a manual classification decision.

    Extracts title keywords and the company name, associates both with
    *status*, and persists the updated patterns immediately.

    Args:
        name: Contact name (currently unused; kept for call-site symmetry).
        title: LinkedIn title the keywords are extracted from.
        company: Company name, or None/empty when unknown.
        status: One of "ICP", "POI", "Not ICP"; anything else is ignored.
    """
    if status not in ("ICP", "POI", "Not ICP"):
        return  # Only learn from definitive classifications

    # Learn from title keywords (setdefault replaces check-then-insert)
    for keyword in _extract_title_keywords(title):
        counts = _LEARNED_PATTERNS["title_keywords"].setdefault(
            keyword, {"ICP": 0, "POI": 0, "Not ICP": 0}
        )
        counts[status] += 1

    # Learn from company (if not SiteHub itself)
    if company and not _SITEHUB_RE.search(company):
        company_lower = company.lower().strip()
        entry = _LEARNED_PATTERNS["companies"].setdefault(
            company_lower, {"status": status, "count": 0}
        )
        # Most recent classification wins; count tracks total observations
        entry["status"] = status
        entry["count"] += 1

    # Save after learning
    _save_learned_patterns()


def get_learned_suggestion(title: str, company: str | None) -> tuple[str | None, str | None, float]:
    """Suggest a classification status based on learned patterns.

    Returns:
        (suggested_status, reason, confidence) with confidence in 0.0-1.0,
        or (None, None, 0.0) when nothing reaches the 50% threshold.
    """
    scores = {"ICP": 0.0, "POI": 0.0, "Not ICP": 0.0}
    evidence: list[str] = []

    # Company patterns carry the most weight (60%)
    if company:
        company_key = company.lower().strip()
        entry = _LEARNED_PATTERNS.get("companies", {}).get(company_key)
        if entry is not None and entry["count"] >= 2:
            # At least 2 sightings required before trusting a company match
            scores[entry["status"]] += 0.6
            evidence.append(f"company:{company_key}({entry['count']}x)")

    # Title keyword patterns contribute up to 30% each
    known_keywords = _LEARNED_PATTERNS.get("title_keywords", {})
    for keyword in _extract_title_keywords(title):
        counts = known_keywords.get(keyword)
        if counts is None:
            continue
        total = sum(counts.values())
        if total < 3:
            continue  # too few observations to be meaningful
        top_count = max(counts.values())
        for status in ("ICP", "POI", "Not ICP"):
            if counts[status] > 0:
                scores[status] += (counts[status] / total) * 0.3
                # Record only the dominant status as evidence for this keyword
                if counts[status] == top_count:
                    evidence.append(f"title:{keyword}({counts[status]}/{total})")

    best_status = max(scores, key=scores.get)
    confidence = scores[best_status]

    # Only suggest when confidence reaches 50%; cap reasons at three
    if confidence >= 0.5:
        return best_status, "+".join(evidence[:3]), confidence
    return None, None, 0.0


def show_learned_patterns_summary():
    """Print a summary of learned patterns."""
    print("\n=== LEARNED CLASSIFICATION PATTERNS ===\n")

    # Title keywords, most frequently seen first
    print("TITLE KEYWORDS (sorted by total occurrences):")
    keyword_stats = _LEARNED_PATTERNS.get("title_keywords", {})
    by_frequency = sorted(keyword_stats.items(), key=lambda kv: sum(kv[1].values()), reverse=True)

    for kw, counts in by_frequency[:20]:
        total = sum(counts.values())
        dominant = max(counts, key=counts.get)
        pct = counts[dominant] / total * 100 if total > 0 else 0
        print(f"  {kw:20} -> {dominant:8} ({pct:4.0f}%) [ICP:{counts['ICP']} POI:{counts['POI']} Not:{counts['Not ICP']}]")

    # Companies, highest observation count first
    company_stats = _LEARNED_PATTERNS.get("companies", {})
    print(f"\nLEARNED COMPANIES ({len(company_stats)} total):")
    by_count = sorted(company_stats.items(), key=lambda kv: kv[1]["count"], reverse=True)

    for comp, data in by_count[:15]:
        print(f"  {comp:30} -> {data['status']:8} ({data['count']}x)")


def bootstrap_learning_from_existing(tracker_path: str | Path | None = None, verbose: bool = True):
    """Learn patterns from all existing manually classified contacts.

    Bootstraps the learning system from historical decisions in the
    tracker's "Contacts" sheet. Only rows with status ICP, POI, or Not ICP
    are used; SiteHub employees are skipped.

    Args:
        tracker_path: Tracker workbook to read; defaults to ``TRACKER_PATH``.
        verbose: Print progress and a pattern summary when True.

    Returns:
        Dict mapping each status to the number of rows learned from.

    Raises:
        KeyError: if the workbook has no "Contacts" sheet.
        ValueError: if an expected header column is missing.
    """
    tracker_path = Path(tracker_path or TRACKER_PATH)

    if verbose:
        print("=== Bootstrapping Learning from Existing Classifications ===")
        print(f"Tracker: {tracker_path.name}\n")

    wb = openpyxl.load_workbook(tracker_path)
    learned_count = {"ICP": 0, "POI": 0, "Not ICP": 0}
    # BUGFIX: close the workbook even when the sheet/headers are malformed —
    # previously an exception here leaked the open workbook.
    try:
        ws = wb["Contacts"]
        headers = [cell.value for cell in ws[1]]

        name_idx = headers.index("Name")
        title_idx = headers.index("Title")
        company_idx = headers.index("Company")
        status_idx = headers.index("Status")

        for row in ws.iter_rows(min_row=2, values_only=True):
            name = row[name_idx]
            title = row[title_idx] or ""
            company = row[company_idx] or ""
            status = row[status_idx]

            if status in ("ICP", "POI", "Not ICP"):
                # Skip SiteHub employees — internal staff must not train the model
                if _is_sitehub_employee(company, title, name):
                    continue

                learn_from_classification(name, title, company, status)
                learned_count[status] += 1
    finally:
        wb.close()

    if verbose:
        print("Learned from existing classifications:")
        print(f"  ICP: {learned_count['ICP']}")
        print(f"  POI: {learned_count['POI']}")
        print(f"  Not ICP: {learned_count['Not ICP']}")
        print(f"  Total: {sum(learned_count.values())}")
        print()
        show_learned_patterns_summary()

    return learned_count


# Load learned patterns on module import so suggestions work immediately
_load_learned_patterns()


def _is_sitehub_employee(company: str | None, title: str | None = None, name: str | None = None) -> bool:
    """Return True when the contact looks like a SiteHub employee.

    Matches SiteHub references in the company or title strings, then checks
    the name against the known-employee list.
    """
    for field in (company, title):
        if field and _SITEHUB_RE.search(field):
            return True
    return bool(name) and name.lower().strip() in _SITEHUB_EMPLOYEES


# ---------------------------------------------------------------------------
# Phase 1: Company Name Normalization
# ---------------------------------------------------------------------------
# Common company suffixes to strip for matching. The regex is end-anchored,
# so only a single trailing suffix is removed ("X Holding A/S" keeps "holding").
_COMPANY_SUFFIXES = [
    r"\s+a/s$", r"\s+aps$", r"\s+a\.s\.$", r"\s+ApS$", r"\s+A/S$",
    r"\s+as$", r"\s+AS$",
    # Hyphen variants
    r"\s+a-s$", r"\s+ap-s$",
    # Partnerships
    r"\s+k/s$", r"\s+p/s$", r"\s+s/l$",
    # Cooperative
    r"\s+amba$",
    r"\s+danmark$", r"\s+denmark$", r"\s+dk$",
    r"\s+i/s$", r"\s+I/S$",
    r"\s+holding$", r"\s+group$", r"\s+koncern$",
    r"\s+gmbh$", r"\s+inc\.?$", r"\s+ltd\.?$", r"\s+llc$",
    r"\s+co\.?$", r"\s+corp\.?$",
]
_COMPANY_SUFFIX_STRIP_RE = re.compile(
    "|".join(_COMPANY_SUFFIXES), re.IGNORECASE
)


def normalize_company_name(name: str) -> str:
    """Normalize a company name by removing common suffixes and cleaning up.

    Examples:
        "Enemærke & Petersen A/S" -> "enemærke & petersen"
        "NCC Danmark" -> "ncc"
        "COWI A/S" -> "cowi"
        "CG Jensen | Bestyrelsesarbejde..." -> "cg jensen"
    """
    if not name:
        return ""

    # Lowercase
    normalized = name.strip().lower()

    # Strip text after pipe or dash separators (often extra descriptions).
    # BUGFIX: this must happen BEFORE hyphens are converted to spaces —
    # previously " - " could never match because every "-" had already been
    # replaced, so dash-separated taglines were kept in the name.
    for sep in (" | ", " - ", " – ", " — "):
        if sep in normalized:
            normalized = normalized.split(sep)[0].strip()

    # Convert remaining hyphens to spaces for matching
    normalized = normalized.replace('-', ' ')

    # Remove a trailing company suffix (A/S, ApS, Group, ...)
    normalized = _COMPANY_SUFFIX_STRIP_RE.sub("", normalized).strip()

    # Collapse internal whitespace
    normalized = " ".join(normalized.split())

    return normalized


# ---------------------------------------------------------------------------
# Phase 2: Fuzzy Company Matching
# ---------------------------------------------------------------------------
def _normalize_danish_chars(s: str) -> str:
    """Normalize Danish special characters for matching.

    Fixes fuzzy matching failures like "Højgaard" vs "Hoejgaard".
    """
    replacements = {
        'æ': 'ae', 'ø': 'oe', 'å': 'aa',
        'Æ': 'Ae', 'Ø': 'Oe', 'Å': 'Aa',
        'é': 'e', 'è': 'e', 'ê': 'e',
        'ü': 'u', 'ö': 'o', 'ä': 'a',
    }
    for char, replacement in replacements.items():
        s = s.replace(char, replacement)
    return s


def _similarity_ratio(s1: str, s2: str) -> float:
    """Calculate similarity ratio between two strings (0.0 to 1.0).

    Uses a simple character overlap approach (no external dependencies).
    Based on Dice coefficient: 2 * |intersection| / (|s1| + |s2|)
    """
    if not s1 or not s2:
        return 0.0

    # Normalize Danish characters before comparison
    s1 = _normalize_danish_chars(s1.lower())
    s2 = _normalize_danish_chars(s2.lower())

    # For short strings, use exact substring matching
    if len(s1) <= 4 or len(s2) <= 4:
        if s1 in s2 or s2 in s1:
            return len(min(s1, s2, key=len)) / len(max(s1, s2, key=len))
        return 0.0

    # Use bigrams (2-character sequences) for longer strings
    def bigrams(s):
        return set(s[i:i+2] for i in range(len(s) - 1))

    b1 = bigrams(s1)
    b2 = bigrams(s2)

    if not b1 or not b2:
        return 0.0

    intersection = len(b1 & b2)
    return 2.0 * intersection / (len(b1) + len(b2))


def fuzzy_match_company(
    company_name: str,
    reference_companies: dict,
    threshold: float = 0.80,
) -> tuple[str | None, float]:
    """Find the best fuzzy match for a company name in the reference list.

    Args:
        company_name: The company name to match.
        reference_companies: Dict of normalized_name -> company_info from the
            reference list; each info dict carries a "name" key.
        threshold: Minimum similarity score (0.0-1.0) to consider a match.

    Returns:
        (matched_canonical_name, score) or (None, 0.0) if no match.
    """
    if not company_name:
        return None, 0.0

    target = normalize_company_name(company_name)
    if not target:
        return None, 0.0

    winner: str | None = None
    winner_score = 0.0

    for candidate_key, candidate_info in reference_companies.items():
        # Exact matches are handled elsewhere — only fuzzy ones here
        if candidate_key == target:
            continue

        score = _similarity_ratio(target, candidate_key)
        if score >= threshold and score > winner_score:
            winner_score = score
            winner = candidate_info["name"]

    return winner, winner_score


# ---------------------------------------------------------------------------
# Stage A: Title Parsing — extract company from LinkedIn title string
# ---------------------------------------------------------------------------
# Blocklist patterns — credentials, degrees, certifications (not companies)
_NOT_COMPANY_PATTERNS = [
    # Danish degrees
    r"cand\.\w+",        # Cand.Polyt, Cand.Merc, Cand.Arch
    r"\bhd\b", r"\bhd\(",    # HD, HD(O), HD(R)
    # International degrees (need word boundaries on both sides to avoid matching company names)
    r"\bmsc\b", r"\bmba\b", r"\bbsc\b", r"\bphd\b", r"\bba\b", r"\bma\b",
    # Certifications
    r"\bpmp\b", r"prince2", r"scrum", r"itil", r"togaf",
]
_NOT_COMPANY_RE = re.compile(
    "|".join(_NOT_COMPANY_PATTERNS), re.IGNORECASE
)

# Service/tagline keywords that indicate description, not company
# NOTE: "consulting", "services", "advisory" removed - these are common in company names
# (e.g., "NT Consulting", "AM Security Service", "EY Advisory")
_TAGLINE_KEYWORDS = [
    "løsninger", "rådgivning", "ydelser", "solutions",
    "helping", "hjælper",
    "specialist i", "ekspert i", "expert in", "passionate about",
    # Location/site types (not companies)
    "byggepladser", "byggeplads", "sites", "projects", "kontor",
    # Domain/specialty areas
    "fm-systemer", "systemer", "ejendomme", "ejendomsdrift",
    "digitalisering", "facilities", "facility management",
    "byggeprocessen", "byggeriet", "byggebranche",
]

# Company suffix indicators — segments containing these are likely companies
_COMPANY_SUFFIX_PATTERNS = [
    r"\ba/s\b", r"\baps\b", r"\bi/s\b", r"\bk/s\b", r"\bp/s\b",
    r"\bholding\b", r"\bgroup\b", r"\bpartners\b",
    r"\binc\.?\b", r"\bltd\.?\b", r"\bgmbh\b", r"\bab\b",
    r"\bconsulting\s+group\b", r"\bconsulting\b",
]
_COMPANY_SUFFIX_RE = re.compile(
    "|".join(_COMPANY_SUFFIX_PATTERNS), re.IGNORECASE
)

# Words that indicate we should truncate after them (not part of company name)
_TRUNCATE_WORDS = [
    "teknisk", "technical", "servicepartner", "afdeling", "division",
    "department", "branch", "region", "nordic", "nordics", "scandinavia",
]

# Role words expanded — used to detect non-company segments
# (matched as substrings by _looks_like_role, so "leder" also covers
# compounds like "projektleder")
_ROLE_WORDS_EXTENDED = {
    # English
    "manager", "director", "engineer", "consultant", "specialist",
    "analyst", "developer", "designer", "architect", "adviser",
    "advisor", "coordinator", "lead", "head", "officer", "executive",
    "president", "founder", "owner", "partner", "associate", "senior",
    "junior", "trainee", "intern", "assistant", "supervisor", "foreman",
    # Danish
    "koordinator", "leder", "chef", "ansvarlig", "projektleder",
    "rådgiver", "ingeniør", "konsulent", "direktør", "medarbejder",
    "assistent", "tekniker", "montør", "mester", "svend", "formand",
    "bygningskonstruktør", "konstruktør", "arkitekt", "ejer", "indehaver",
}

# Patterns ordered most‑specific → least‑specific.
# Note: the bare Danish preposition "i" ("in") is in the alternation, so
# titles like "Chef i NCC" also match.
_TITLE_SEP_RE = re.compile(
    r"""
    (?:^|\s)                       # start or space
    (?:at|hos|for|i|@|ved|på)\s+   # Danish/English prepositions
    (.+)$                          # rest = company
    """,
    re.IGNORECASE | re.VERBOSE,
)

# Explicit @ pattern for higher confidence: "Role @ Company"
_AT_COMPANY_RE = re.compile(
    r"@\s*([A-Za-z0-9æøåÆØÅ][A-Za-z0-9æøåÆØÅ\s&\-\.+]+)",
    re.IGNORECASE,
)

# Segment delimiters: pipe, dash, en/em dash (surrounded by optional spaces)
_TITLE_DELIM_RE = re.compile(
    r"""
    \s*[|–—\-]\s*                  # pipe, dash, em‑dash separators
    """,
    re.VERBOSE,
)

# Comma separator — lowest-priority split ("Role, Company")
_TITLE_COMMA_RE = re.compile(
    r"""
    ,\s*                           # comma separator (lowest priority)
    """,
    re.VERBOSE,
)


def _is_valid_company(candidate: str) -> bool:
    """Check if a candidate string looks like a valid company name."""
    if not candidate or len(candidate) < 2:
        return False

    # Real company names rarely exceed 40 chars — longer means tagline/slogan
    if len(candidate) > 40:
        return False

    # More than 5 words usually indicates a slogan, not a name
    if len(candidate.split()) > 5:
        return False

    # Degrees/credentials (Cand.Polyt, MBA, PMP, ...) are never companies
    if _NOT_COMPANY_RE.search(candidate):
        return False

    # Service/tagline vocabulary marks a description, not a company
    lowered = candidate.lower()
    if any(keyword in lowered for keyword in _TAGLINE_KEYWORDS):
        return False

    # Pipes/bullets mean a list of roles, not a single company
    return "|" not in candidate and "•" not in candidate


def parse_title_for_company(title: str) -> str | None:
    """Extract a company name from a LinkedIn title string (Stage A).

    Tries patterns from most specific to least specific:
        0. "Role @ Company"           (explicit @, highest confidence)
        1. "Role at/hos/i Company"    (preposition, with noise truncation)
        2. "... CompanyName A/S"      (company-suffix scan)
        3. "CompanyName - Role ..."   (company-first segment)
        4. "Role | Company"           (delimiter, scanned right to left)
        5. "Role, Company"            (comma, lowest priority)

    Credentials (Cand.Polyt, MSc, MBA, ...) and taglines/slogans are
    filtered out. Returns None when nothing matches confidently —
    better blank than wrong.
    """
    if not title:
        return None

    title = title.strip()

    # 0. Explicit @ pattern: "Role @ Company"
    at_match = _AT_COMPANY_RE.search(title)
    if at_match:
        company = at_match.group(1).split("·")[0].strip()
        # Trust @ even for longer names, but still reject credentials
        if len(company) >= 2 and not _NOT_COMPANY_RE.search(company):
            return company

    # 1. Preposition pattern: "Role at/hos/i Company..."
    prep_match = _TITLE_SEP_RE.search(title)
    if prep_match:
        company = prep_match.group(1).split("·")[0].strip()
        # Truncate at noise (/, |, trailing descriptions) before validating
        company = _truncate_at_noise(company)
        if _is_valid_company(company):
            return company

    # 2. Company suffix detection: "... CompanyName A/S" anywhere in the title
    by_suffix = _extract_company_from_suffix_match(title)
    if by_suffix:
        return by_suffix

    # 3. Company-first pattern: "CompanyName - Role descriptions"
    leading = _extract_company_first_segment(title)
    if leading:
        return leading

    # 4. Delimiter pattern, right to left (last segment is usually the
    #    company, but may be a region name — keep scanning leftwards)
    segments = _TITLE_DELIM_RE.split(title)
    for pos in range(len(segments) - 1, 0, -1):
        company = segments[pos].strip().split("·")[0].strip()
        if _is_valid_company(company) and _looks_like_company_name(company):
            return company

    # 5. Comma pattern: "Role, Company"
    comma_parts = _TITLE_COMMA_RE.split(title)
    if len(comma_parts) >= 2:
        company = comma_parts[-1].strip().split("·")[0].strip()
        if _is_valid_company(company) and _looks_like_company_name(company):
            return company

    # When in doubt, return None (better blank than wrong)
    return None


def _looks_like_role(text: str) -> bool:
    """Heuristic: does the text look more like a job role than a company?"""
    lowered = text.lower()
    return any(word in lowered for word in _ROLE_WORDS_EXTENDED)


def _has_company_suffix(text: str) -> bool:
    """Check if text contains a company suffix indicator (A/S, Group, etc.)."""
    return _COMPANY_SUFFIX_RE.search(text) is not None


def _is_preposition_based_extraction(title: str, extracted: str) -> bool:
    """Check if the extracted company came from a preposition pattern.

    Returns True if title contains "at/hos/for/ved/i/@ {extracted}", indicating
    high confidence that the extraction is a real company name.
    """
    if not title or not extracted:
        return False

    import re
    # Escape special regex chars in extracted company name
    escaped = re.escape(extracted.rstrip('.'))  # Remove trailing dot for matching

    # Check if title contains preposition + extracted company
    pattern = rf'(?:^|\s)(?:at|hos|for|ved|i|@)\s+{escaped}'
    return bool(re.search(pattern, title, re.IGNORECASE))


def _extract_company_from_suffix_match(title: str) -> str | None:
    """Scan title for segments containing company suffixes.

    Handles: "Key Account Manager B2B Elgiganten A/S"
    Returns: "Elgiganten A/S"

    Returns None when no segment yields a plausible suffixed company name.
    """
    # Split by common delimiters (pipe, dashes, comma)
    segments = re.split(r'\s*[|–—]\s*|\s*,\s*', title)

    for segment in segments:
        segment = segment.strip()
        if _has_company_suffix(segment):
            # Look for company name pattern: 1-3 capitalized words followed by
            # a suffix. The word part is intentionally case-SENSITIVE — the old
            # blanket re.IGNORECASE silently defeated the [A-ZÆØÅ][a-zæøå]+
            # capitalization constraint documented here. Only the suffix
            # alternation stays case-insensitive, via a scoped (?i:) group.
            match = re.search(
                r'([A-ZÆØÅ][a-zæøå]+(?:\s+(?:&\s+)?[A-ZÆØÅ]?[a-zæøå]+){0,2})\s+((?i:A/S|ApS|I/S|K/S|P/S|Holding|Group|Partners|Inc\.?|Ltd\.?|GmbH|AB))\b',
                segment,
            )
            if match:
                candidate = f"{match.group(1)} {match.group(2)}".strip()
                if len(candidate) >= 3 and not _looks_like_role(candidate):
                    return candidate

            # Also try: capitalized word(s) directly before a Danish suffix at
            # the end of the segment (this pattern already tolerates all-caps
            # and digit-bearing names like "TD-K" or "3F").
            match2 = re.search(
                r'([A-ZÆØÅ][A-Za-zæøåÆØÅ0-9&]+(?:\s+[A-ZÆØÅ]?[A-Za-zæøåÆØÅ0-9&]+)?)\s+((?i:A/S|ApS|I/S))\s*$',
                segment,
            )
            if match2:
                candidate = f"{match2.group(1)} {match2.group(2)}".strip()
                if len(candidate) >= 3 and not _looks_like_role(candidate):
                    return candidate

    return None


def _truncate_at_noise(text: str) -> str:
    """Truncate a company-name candidate at noise words/patterns.

    "CBRE teknisk Servicepartner" -> "CBRE"
    "TD-K | Maskinmester" -> "TD-K"
    "Nasdaq | Tech Investor" -> "Nasdaq"
    "Nordstern / Bygningskonstruktør" -> "Nordstern"

    Company suffixes such as A/S, I/S, K/S, P/S are preserved.
    """
    # Cut at spaced delimiters first (pipe/dash/slash surrounded by whitespace);
    # a spaced slash is a separator, but "A/S"-style suffixes are untouched.
    text = re.split(r'\s+[|–—]\s+|\s+/\s+', text)[0].strip()
    # Cut at a pipe even when it has no surrounding spaces.
    text = re.split(r'\s*\|\s*', text)[0].strip()
    # Cut at a bare slash only when it is clearly a separator — the lookbehind/
    # lookahead protect the A/S, I/S, K/S, P/S suffix forms.
    text = re.split(r'(?<![AIKPaikp])/(?![Ss])', text)[0].strip()

    # Truncate at known noise words, keeping at least a short prefix.
    lowered = text.lower()
    for noise in _TRUNCATE_WORDS:
        pos = lowered.find(noise)
        if pos > 2:  # require some company text before the noise word
            text = text[:pos].strip()
            lowered = text.lower()

    # Truncate at a second preposition (in/for/with/med/til/og).
    prep = re.search(r'\s+(in|for|with|med|til|og)\s+', text, re.IGNORECASE)
    if prep and prep.start() > 2:
        text = text[:prep.start()].strip()

    return text


def _extract_company_first_segment(title: str) -> str | None:
    """Return the first segment (before a spaced dash/pipe) if it looks like a company.

    Handles: "Bygkontrol - Konstruktører og Ingeniører"
    Returns: "Bygkontrol"
    """
    # Split only on whitespace-surrounded separators so hyphenated words
    # like "HSE-Manager" survive intact.
    pieces = re.split(r'\s+[-–—|]\s+', title)
    if len(pieces) < 2:
        return None

    head = pieces[0].strip()

    # Length sanity bounds for a plausible company name.
    if not (3 <= len(head) <= 35):
        return None

    # Bare 2-4 letter acronyms (HSE, IT, HR, ...) are usually role prefixes.
    if re.fullmatch(r'[A-Z]{2,4}', head):
        return None

    # Role-like first segments are rejected.
    if _looks_like_role(head):
        return None

    # Companies are proper nouns: must start with a capital letter.
    if not head[0].isupper():
        return None

    # More than four words reads like a description, not a name.
    if len(head.split()) > 4:
        return None

    # A company suffix makes this a high-confidence hit.
    if _has_company_suffix(head):
        return head

    # Otherwise accept only when the remainder reads like roles/descriptions.
    remainder = " ".join(pieces[1:]).lower()
    role_markers = ("konstruktør", "ingeniør", "rådgiver", "konsulent",
                    "specialist", "manager", "director", "engineer",
                    "technologist", "coordinator", "leder", "chef")
    if any(marker in remainder for marker in role_markers):
        # The rest is roles, so the first segment is likely the company.
        return head

    return None


def _is_high_confidence_company(
    company: str,
    ref: dict | None = None,
) -> bool:
    """Decide whether a Stage A extraction is trustworthy without validation.

    High confidence when ANY of:
    - it carries a company suffix (A/S, ApS, Group, Inc, etc.)
    - it matches the reference list (exact, normalized, or alias)
    - it contains "&" in a proper company pattern (e.g. "Enemærke & Petersen")

    Everything else — single words like "Byggeleder"/"COWI" and suffix-less
    multi-word names like "MT Højgaard" — is low confidence and should be
    validated externally (web/CVR) before being trusted.
    """
    if not company or len(company) < 2:
        return False

    # Suffix present -> definitely a company.
    if _has_company_suffix(company):
        return True

    # Reference-list hit -> known company.
    if ref:
        lowered = company.lower()
        normalized = normalize_company_name(company)

        if lowered in ref.get("companies", {}):
            return True
        if normalized in ref.get("companies_normalized", {}):
            return True

        alias_map = ref.get("aliases", {})
        if lowered in alias_map or normalized in alias_map:
            return True

    # "&" between words suggests a partnership name ("Enemærke & Petersen"),
    # unless the words read like a tagline ("Innovation & Strategy").
    if "&" in company and len(company.split()) >= 2:
        lowered = company.lower()
        taglines = ("innovation", "strategy", "solutions", "services", "consulting")
        if not any(t in lowered for t in taglines):
            return True

    # Low confidence: caller should validate via web search or CVR.
    return False


def _looks_like_company_name(text: str) -> bool:
    """Check if text looks like a company name (not a role/skill/description).

    The blocklist below is matched on WORD BOUNDARIES, not raw substrings.
    The previous substring check falsely rejected real company names that
    merely contained a short blocklist token (e.g. "it" inside "Unity" or
    "Siteimprove", "drift" inside longer words). Duplicate entries were
    also removed.
    """
    if not text or len(text) < 3:
        return False

    lower = text.lower()

    # Has company suffix = definitely company
    if _has_company_suffix(text):
        return True

    # Looks like a role = not company
    if _looks_like_role(text):
        return False

    # Blocklist: generic words/phrases that are NOT companies (deduplicated)
    not_company_phrases = [
        # English
        "building", "materials", "services", "solutions", "products",
        "management", "development", "innovation", "strategy", "operations",
        "opportunities", "investor", "investments", "ventures",
        "mentor", "coach", "trainer", "speaker", "author",
        "freelance", "selvstændig", "independent", "retired", "pensionist",
        "enthusiast", "expert", "specialist", "professional", "passionate",
        # Role/title words that get parsed as companies
        "officer", "owner", "manager", "director", "leader", "head of",
        "product owner", "project manager", "team lead", "architect",
        # Danish skill/discipline words (often in pipe-separated titles)
        "procesudvikling", "formidling", "udvikling", "optimering",
        "ledelse", "rådgivning", "undervisning", "projektledelse",
        "kvalitet", "bæredygtighed", "digitalisering", "sikkerhed",
        "compliance", "processer", "produktion", "drift", "logistik",
        "indkøb", "salg", "marketing", "kommunikation", "strategi",
        "forretningsudvikling", "transformation", "implementering",
        "analyse", "data", "teknologi", "it", "software", "hardware",
        "iot", "automation", "integration", "agile", "scrum", "devops",
        # Geographic regions (not companies)
        "dach", "nordics", "scandinavia", "emea", "apac", "latam",
        "denmark", "danmark", "sweden", "norge", "norway", "finland",
        # Academic degrees parsed as companies
        "cand.polyt", "cand.arch", "msc", "mba", "phd", "bsc",
        # Location/site types (not companies)
        "byggepladser", "byggeplads", "sites", "site", "projects",
        "kontor", "office", "remote", "hybrid",
        # Domain/specialty areas (not companies)
        "fm-systemer", "systemer", "system", "platforme",
        "fagchef", "chefrådgiver", "chefkonsulent", "seniorkonsulent",
        "ejendomme", "ejendomsdrift", "facilities", "facility management",
        "sustainability",
        # Learned from review corrections
        "solenergi", "bæredygtig fm", "byggeprocessen", "byggeriet",
        "calibration", "maintenance", "innovative", "problem solver",
        "samarbejde", "vejen frem", "koncernøkonomi",
        "spildevand", "ledelse dl", "diplom",
        # Topic/specialty areas (not companies)
        "climate change", "wastewater", "adaption", "adaptation",
        "energy efficiency", "renewable", "carbon", "emissions",
        "circular economy", "green transition", "biodiversity",
        # Common Danish non-company words (from review 2025-02)
        "arbejdsmiljø", "arbejdsmiljo", "tekniske alarmer", "tekniske",
        "handlekraftig", "energisk", "entusiastisk",
        "certificeret", "mægler", "mediator", "collaboration",
        "rådgiver", "konsulent", "koordinator",
        "brandsikkerhed", "brand", "alarm", "adgangskontrol",
        "tv-overvågning", "vagtydelser", "sikringsløsninger",
        "vidensbaseret", "sikkerhedsrådgivning", "compliance og kædeansvar",
        "produktionsoptimering", "medarbejder-it",
        # Geographic regions
        "jylland", "sjælland", "fyn", "zealand", "copenhagen",
        # Role fragments
        "pa i ceo", "ea i pa",
    ]
    # Whole-word match (the re module caches the compiled alternation).
    blocklist_pattern = r'\b(?:' + '|'.join(re.escape(p) for p in not_company_phrases) + r')\b'
    if re.search(blocklist_pattern, lower):
        return False

    # Check for company-like patterns
    # - Is a known org pattern like "videnscenter", "lab", "center"
    company_indicators = ["center", "centre", "lab", "videnscenter",
                          "institut", "forening", "forbund", "fond"]
    if any(ci in lower for ci in company_indicators):
        return True

    # & is a company indicator ONLY if it looks like "Company1 & Company2" pattern
    # Not for geographic regions like "DACH & Denmark" or taglines
    if "&" in text:
        # Check if it's a proper company name pattern (capitalized words around &)
        parts = text.split("&")
        if len(parts) == 2:
            left = parts[0].strip()
            right = parts[1].strip()
            # Both parts should be short, capitalized, and not geographic
            if (left and right and
                left[0].isupper() and right[0].isupper() and
                len(left.split()) <= 2 and len(right.split()) <= 2 and
                not any(geo in lower for geo in ["denmark", "danmark", "norway", "sweden", "dach"])):
                return True

    # Must start with capital and be reasonable length
    if not text[0].isupper():
        return False

    # Single generic words are usually not companies
    if len(text.split()) == 1 and len(text) < 6:
        return False

    return len(text.split()) <= 4


# ---------------------------------------------------------------------------
# Stage B: Brave Web Search API Lookup
# ---------------------------------------------------------------------------
# Per-run memo of lookups: "name|title" -> company (None means "tried, no hit"
# so a failed lookup is not repeated and does not burn quota again this run).
_SEARCH_CACHE: dict[str, str | None] = {}
# Brave free-tier quota: queries per calendar month.
_MONTHLY_SEARCH_LIMIT = 2000
# Persisted usage counter lives next to this script.
_SEARCH_USAGE_FILE = Path(__file__).resolve().parent / ".brave_search_usage.json"
_LAST_SEARCH_TIME: float = 0.0  # Rate limiting: 1 query/second


def _get_search_usage() -> dict:
    """Return this month's search-usage record from the DB or the JSON file.

    The record has keys "month" (YYYY-MM), "count", and "limit_notified".
    A missing/unreadable file, or a record left over from a previous month,
    yields a fresh zeroed record.
    """
    if _USE_DB:
        from icp_db import get_search_usage
        return get_search_usage(_get_db_session(), "brave_search")

    this_month = datetime.now().strftime("%Y-%m")
    fresh = {"month": this_month, "count": 0, "limit_notified": False}

    if not _SEARCH_USAGE_FILE.exists():
        return fresh

    try:
        import json
        with open(_SEARCH_USAGE_FILE, "r") as f:
            stored = json.load(f)
    except Exception:
        return fresh

    # Roll over to a zeroed record when the month has changed.
    return stored if stored.get("month") == this_month else fresh


def _save_search_usage(usage: dict) -> None:
    """Persist the search-usage record to the DB or the JSON file (best effort)."""
    if _USE_DB:
        from icp_db import save_search_usage
        save_search_usage(
            _get_db_session(), "brave_search",
            usage.get("month", datetime.now().strftime("%Y-%m")),
            usage.get("count", 0),
            usage.get("limit_notified", False),
        )
        return

    # File mode: swallow write errors — losing one counter update is acceptable.
    try:
        import json
        _SEARCH_USAGE_FILE.write_text(json.dumps(usage))
    except Exception:
        pass


def _increment_search_usage() -> tuple[int, bool]:
    """Bump this month's usage counter. Returns (new_count, limit_reached)."""
    if _USE_DB:
        from icp_db import increment_search_usage
        return increment_search_usage(_get_db_session(), "brave_search")

    # NOTE(review): file-mode read-modify-write is not concurrency-safe;
    # acceptable for a single weekly batch run.
    usage = _get_search_usage()
    new_count = usage["count"] + 1
    usage["count"] = new_count
    _save_search_usage(usage)
    return new_count, new_count >= _MONTHLY_SEARCH_LIMIT


def _check_search_limit() -> tuple[bool, int]:
    """Return (can_search, remaining) for this month's Brave Search quota."""
    remaining = _MONTHLY_SEARCH_LIMIT - _get_search_usage()["count"]
    return remaining > 0, remaining


def brave_lookup_company(
    name: str,
    title: str = "",
    *,
    api_key: str | None = None,
    verbose: bool = True,
) -> str | None:
    """Search Brave Web Search API for a person's company.

    Uses Brave Search API (2000 free queries/month).
    Searches for the person's name + title keywords.

    Args:
        name: Person's full name
        title: Person's title (used as additional search context)
        api_key: Brave API key (or set BRAVE_API_KEY env var)
        verbose: Print status messages

    Returns:
        Company name if found, None otherwise.

    Side effects:
        Updates the in-process _SEARCH_CACHE (misses are cached as None so a
        failed lookup is never retried in this run), increments the persisted
        monthly usage counter, and sleeps to honor the 1 query/second
        free-tier rate limit.
    """
    # Check cache first (a cached None means "already tried, no answer")
    cache_key = f"{name}|{title}"
    if cache_key in _SEARCH_CACHE:
        return _SEARCH_CACHE[cache_key]

    # Check monthly limit
    can_search, remaining = _check_search_limit()
    if not can_search:
        usage = _get_search_usage()
        # Print the limit warning only once per month (flag is persisted).
        if not usage.get("limit_notified"):
            print(f"\n⚠️  MONTHLY SEARCH LIMIT REACHED ({_MONTHLY_SEARCH_LIMIT} searches)")
            print("    Web search disabled for this month. Resets on the 1st.")
            print("    Contacts without companies will go to Review.\n")
            usage["limit_notified"] = True
            _save_search_usage(usage)
        _SEARCH_CACHE[cache_key] = None
        return None

    # Get API credentials
    api_key = api_key or os.environ.get("BRAVE_API_KEY")

    if not api_key:
        # Silently skip if credentials not configured
        _SEARCH_CACHE[cache_key] = None
        return None

    try:
        import requests
    except ImportError:
        # requests not installed: web lookup is silently unavailable
        _SEARCH_CACHE[cache_key] = None
        return None

    # Build search query - name + key title words + LinkedIn
    # Extract meaningful words from title for better search
    title_words = []
    if title:
        # Remove common noise and extract key terms
        clean_title = re.sub(r'[|•·–—\-,]', ' ', title)
        words = clean_title.split()
        # Keep meaningful words (skip short ones and common prepositions)
        skip_words = {'at', 'hos', 'i', 'the', 'and', 'or', 'for', 'in', 'of', 'a', 'an'}
        title_words = [w for w in words[:5] if len(w) > 2 and w.lower() not in skip_words]

    # Search with site preference for high-quality sources
    query = f'"{name}" {" ".join(title_words[:3])} (site:rocketreach.co OR site:theorg.com OR site:linkedin.com)'

    try:
        # Rate limiting: max 1 query per second (Brave free tier limit)
        global _LAST_SEARCH_TIME
        elapsed = time.time() - _LAST_SEARCH_TIME
        if elapsed < 1.1:  # Add small buffer
            time.sleep(1.1 - elapsed)
        _LAST_SEARCH_TIME = time.time()

        resp = requests.get(
            "https://api.search.brave.com/res/v1/web/search",
            headers={
                "X-Subscription-Token": api_key,
                "Accept": "application/json",
            },
            params={
                "q": query,
                "count": 5,  # Get top 5 results
            },
            timeout=10,
        )
        resp.raise_for_status()
        data = resp.json()

        # Increment usage counter (the request itself counted against quota)
        count, limit_reached = _increment_search_usage()
        if verbose and count % 50 == 0:
            print(f"    [Brave Search: {count}/{_MONTHLY_SEARCH_LIMIT} queries used this month]")
        if limit_reached and verbose:
            print(f"\n⚠️  MONTHLY SEARCH LIMIT REACHED ({_MONTHLY_SEARCH_LIMIT} searches)")

        # Parse results - look for company in titles and snippets.
        # Patterns are tried in fixed priority order per result; the first
        # plausible candidate (2-60 chars, not role-like) wins and is cached.
        web_results = data.get("web", {}).get("results", [])
        for result in web_results:
            snippet = result.get("description", "")
            title_text = result.get("title", "")

            # Clean up HTML entities
            snippet = snippet.replace("&amp;", "&").replace("<strong>", "").replace("</strong>", "")
            title_text = title_text.replace("&amp;", "&")

            # Common formats:
            # Title: "Name - Title at Company | LinkedIn"
            # Title: "Name - Title @ Company - Crunchbase"
            # Snippet: "Name is the CEO at Company..."

            # Pattern 1: "at/hos/@ Company" in title (most reliable)
            title_at_match = re.search(
                r'(?:at|hos|@)\s+([A-ZÆØÅ][A-Za-zæøåÆØÅ0-9\s&\-\.]+?)(?:\s*[|·\-–—]|$)',
                title_text,
                re.IGNORECASE,
            )
            if title_at_match:
                company = title_at_match.group(1).strip()
                if 2 <= len(company) <= 60 and not _looks_like_role(company):
                    _SEARCH_CACHE[cache_key] = company
                    return company

            # Pattern 2: "at/hos/@ Company" in snippet
            snippet_match = re.search(
                r'(?:at|hos|@)\s+([A-ZÆØÅ][A-Za-zæøåÆØÅ0-9\s&\-\.]+?)(?:\s*[·|,\.]|$)',
                snippet,
                re.IGNORECASE,
            )
            if snippet_match:
                company = snippet_match.group(1).strip()
                if 2 <= len(company) <= 60 and not _looks_like_role(company):
                    _SEARCH_CACHE[cache_key] = company
                    return company

            # Pattern 3: "Name - Company | LinkedIn/RocketReach/TheOrg" title format
            # e.g., "Anders Skipper - Dalux | LinkedIn"
            linkedin_title_match = re.search(
                r'^[^|]+[-–—]\s*([A-ZÆØÅ][A-Za-zæøåÆØÅ0-9\s&\-\.]+?)\s*\|\s*(?:LinkedIn|RocketReach|The Org)',
                title_text,
                re.IGNORECASE,
            )
            if linkedin_title_match:
                company = linkedin_title_match.group(1).strip()
                if 2 <= len(company) <= 60 and not _looks_like_role(company):
                    _SEARCH_CACHE[cache_key] = company
                    return company

            # Pattern 4: Look for "CEO/Founder/etc at/of Company" in snippet
            role_company_match = re.search(
                r'(?:CEO|Founder|Director|Manager|Head|Chief)\s+(?:at|of|&|and)\s+([A-ZÆØÅ][A-Za-zæøåÆØÅ0-9\s&\-\.]+?)(?:\s*[·|,\.]|$)',
                snippet,
                re.IGNORECASE,
            )
            if role_company_match:
                company = role_company_match.group(1).strip()
                if 2 <= len(company) <= 60 and not _looks_like_role(company):
                    _SEARCH_CACHE[cache_key] = company
                    return company

            # Pattern 5: RocketReach format - "Name's email & phone | Company"
            rocketreach_match = re.search(
                r"'s\s+(?:email|phone)[^|]*\|\s*([A-ZÆØÅ][A-Za-zæøåÆØÅ0-9\s&\-\.]+)",
                title_text,
                re.IGNORECASE,
            )
            if rocketreach_match:
                company = rocketreach_match.group(1).strip()
                if 2 <= len(company) <= 60 and not _looks_like_role(company):
                    _SEARCH_CACHE[cache_key] = company
                    return company

    except Exception as e:
        # Best-effort: any network/parse failure just reports and falls
        # through to caching a miss below.
        if verbose:
            print(f"    [Brave Search error: {e}]")

    _SEARCH_CACHE[cache_key] = None
    return None


# Aliases for backwards compatibility — earlier versions of this script used
# Google/Bing lookups; existing callers of those names now route to Brave.
google_lookup_company = brave_lookup_company
bing_lookup_company = brave_lookup_company


def get_search_usage_status() -> str:
    """Return a human-readable summary of this month's Brave Search usage."""
    used = _get_search_usage()["count"]
    left = _MONTHLY_SEARCH_LIMIT - used
    return f"Brave Search: {used}/{_MONTHLY_SEARCH_LIMIT} used this month ({left} remaining)"


# ---------------------------------------------------------------------------
# Stage C: CVR API Lookup (Danish Business Registry)
# ---------------------------------------------------------------------------
# Per-run memo: lowercased candidate -> official CVR name (None = no match).
_CVR_CACHE: dict[str, str | None] = {}
_CVR_API_URL = "https://cvrapi.dk/api"
# Identifying User-Agent with contact info, sent with every CVR request.
_CVR_USER_AGENT = "SiteHub ICP Tracker/1.0 (contact: marketing@sitehub.dk)"


def cvr_lookup_company(candidate: str, *, delay: float = 0.5) -> str | None:
    """Validate/lookup a company name against the Danish CVR registry.

    Uses the free cvrapi.dk API (50 lookups/day). Returns the official
    registered company name when a match is found, otherwise None.
    Results (including misses) are memoized in _CVR_CACHE for the run.

    Args:
        candidate: The company name candidate to validate.
        delay: Seconds to sleep before the call (simple rate limiting).
    """
    if not candidate or len(candidate) < 2:
        return None

    cache_key = candidate.lower().strip()
    if cache_key in _CVR_CACHE:
        return _CVR_CACHE[cache_key]

    try:
        import requests
    except ImportError:
        # No HTTP client available: treat as "cannot look up" (not cached).
        return None

    result: str | None = None
    try:
        time.sleep(delay)
        resp = requests.get(
            _CVR_API_URL,
            params={"search": candidate, "country": "dk"},
            headers={"User-Agent": _CVR_USER_AGENT},
            timeout=10,
        )
        if resp.status_code == 200:
            data = resp.json()
            # A successful match returns the company record directly.
            if data and "name" in data:
                result = data["name"]
    except Exception:
        # Network/parse failure counts as a miss for this run.
        result = None

    _CVR_CACHE[cache_key] = result
    return result


def cvr_fuzzy_validate(candidate: str, threshold: float = 0.75) -> str | None:
    """Validate a company candidate against CVR with fuzzy matching.

    Returns the official CVR name when its normalized form is at least
    *threshold* similar to the normalized candidate, otherwise None.
    """
    official = cvr_lookup_company(candidate)
    if not official:
        return None

    # Compare normalized forms to tolerate minor spelling differences.
    score = _similarity_ratio(
        normalize_company_name(candidate),
        normalize_company_name(official),
    )
    return official if score >= threshold else None


def _extract_potential_company_candidates(title: str) -> list[str]:
    """Extract potential company-name candidates from a title for CVR validation.

    Returns up to five plausible strings, ordered by likelihood: segments
    that failed normal extraction but might still be company names.
    """
    if not title:
        return []

    found: list[str] = []
    reject_markers = (
        "helping", "passionate", "looking for", "søger",
        "years", "experience", "expert", "specialist",
        "building", "creating", "driving", "making",
    )

    # Consider each delimiter-separated segment of the title.
    for raw in re.split(r'\s*[|–—]\s*|\s*[,;]\s*', title):
        seg = raw.strip()
        if len(seg) < 3:
            continue

        # Clearly a role -> skip.
        if _looks_like_role(seg):
            continue

        # Long or multi-sentence segments are taglines, not names.
        if len(seg) > 50 or seg.count('.') > 1:
            continue

        # Common non-company phrasing -> skip.
        seg_lower = seg.lower()
        if any(marker in seg_lower for marker in reject_markers):
            continue

        # Plausible: capitalized and at most five words.
        if seg[0].isupper() and len(seg.split()) <= 5:
            found.append(seg)

    # "CompanyName - roles" pattern: promote the first dash-separated segment.
    dash_parts = re.split(r'\s+[-–—]\s+', title)
    if len(dash_parts) >= 2:
        head = dash_parts[0].strip()
        if (head and len(head) >= 3 and head[0].isupper()
                and not _looks_like_role(head) and head not in found):
            found.insert(0, head)

    return found[:5]  # cap at five candidates to preserve API quota


# ---------------------------------------------------------------------------
# Enrichment pipeline
# ---------------------------------------------------------------------------
def enrich_missing_companies(
    contacts: list[dict],
    *,
    ref: dict | None = None,
    use_web: bool = False,
    use_cvr: bool = False,
    verbose: bool = True,
    new_contacts_only: set[str] | None = None,
) -> tuple[int, int, int, int]:
    """Run Stage A (title parsing), optionally Stage B (web) and Stage C (CVR).

    Stage A extractions are validated before being trusted:
    - high confidence (suffix / reference-list hit): accepted directly
    - low confidence: confirmed against the CVR registry, else discarded

    Args:
        contacts: Contact dicts; mutated in place ("Company" gets filled in).
        ref: Optional reference-lists dict used for confidence checks.
        use_web: Enable the Stage B web-search fallback.
        use_cvr: Enable the Stage C CVR candidate lookup.
        verbose: Print per-stage progress messages.
        new_contacts_only: If provided, web/CVR lookups run only for contacts
            whose name.lower() is in this set. Stage A title parsing still
            runs for every contact.

    Returns:
        (stage_a_count, stage_b_count, stage_c_count, validated_count).
    """
    stage_a = 0
    stage_b = 0
    stage_c = 0
    validated_count = 0  # low-confidence Stage A extractions confirmed via CVR

    for contact in contacts:
        if contact.get("Company"):
            continue  # already has a company

        # --- Stage A: parse the LinkedIn title ---
        extracted = parse_title_for_company(contact.get("Title", ""))
        if extracted:
            if _is_high_confidence_company(extracted, ref):
                contact["Company"] = extracted
                stage_a += 1
                continue
            # Low confidence: ask the CVR registry before trusting it.
            confirmed = cvr_fuzzy_validate(extracted, threshold=0.70)
            if confirmed:
                contact["Company"] = confirmed
                stage_a += 1
                validated_count += 1
                if verbose:
                    print(f"    CVR validated: '{extracted}' -> '{confirmed}'")
                continue
            # Unconfirmed extraction is discarded; fall through to stages C/B.

        # API-backed stages apply only to new contacts when a filter is given.
        contact_key = (contact.get("Name") or "").strip().lower()
        allow_api = new_contacts_only is None or contact_key in new_contacts_only

        # --- Stage C: CVR lookup on candidate title segments (before web) ---
        if use_cvr and allow_api:
            for candidate in _extract_potential_company_candidates(contact.get("Title", "")):
                confirmed = cvr_fuzzy_validate(candidate, threshold=0.70)
                if confirmed:
                    contact["Company"] = confirmed
                    stage_c += 1
                    break
            if contact.get("Company"):
                continue

        # --- Stage B: web search as the last resort ---
        if use_web and allow_api:
            hit = google_lookup_company(contact.get("Name", ""), contact.get("Title", ""), verbose=verbose)
            if hit:
                contact["Company"] = hit
                stage_b += 1

    if verbose:
        print(f"  Stage A (title parsing): filled {stage_a} companies")
        if validated_count > 0:
            print(f"    (CVR validated {validated_count} uncertain extractions)")
        if use_cvr:
            print(f"  Stage C (CVR lookup):    filled {stage_c} companies")
        if use_web:
            print(f"  Stage B (web fallback):  filled {stage_b} companies")

    return stage_a, stage_b, stage_c, validated_count


# ---------------------------------------------------------------------------
# Reference Lists loader
# ---------------------------------------------------------------------------
# Module-level caches for quick company lookups during parsing.
# All sets hold lowercased strings; they are populated by the loader
# functions below and consulted during title parsing.
_ref_companies_cache: set = set()  # lowercase company names
_ref_companies_norm_cache: set = set()  # normalized company names
_tracker_companies_cache: set = set()  # companies already in tracker (all statuses)
_company_blocklist: set = set()  # incorrectly extracted "companies" to reject
# Blocklist persists across runs in a plain text file, one entry per line.
_COMPANY_BLOCKLIST_FILE = Path(__file__).resolve().parent / ".company_blocklist.txt"


def _load_company_blocklist() -> None:
    """Populate the in-memory company blocklist from the DB or the file on disk."""
    global _company_blocklist
    if _USE_DB:
        from icp_db import load_company_blocklist
        _company_blocklist = load_company_blocklist(_get_db_session())
        return

    if not _COMPANY_BLOCKLIST_FILE.exists():
        return
    try:
        lines = _COMPANY_BLOCKLIST_FILE.read_text(encoding="utf-8").splitlines()
    except Exception:
        return  # unreadable file: keep whatever is already cached
    _company_blocklist = {entry.strip().lower() for entry in lines if entry.strip()}


def _add_to_company_blocklist(bad_company: str) -> None:
    """Add an incorrectly extracted company name to the blocklist.

    Records both the lowercased raw form and the normalized form so future
    extractions are rejected regardless of which form they match. Entries
    are persisted to the DB or the blocklist file.
    """
    global _company_blocklist
    if not bad_company or len(bad_company) < 2:
        return

    clean = bad_company.strip().lower()
    if not clean or clean in _company_blocklist:
        return

    _company_blocklist.add(clean)
    # Also track the normalized version.
    normalized = normalize_company_name(bad_company)
    if normalized:
        _company_blocklist.add(normalized)

    if _USE_DB:
        from icp_db import add_to_company_blocklist
        add_to_company_blocklist(_get_db_session(), clean)
        if normalized:
            add_to_company_blocklist(_get_db_session(), normalized)
        return

    # Persist to file. BUG FIX: the normalized form was previously kept only
    # in memory and therefore lost on restart; write it alongside the raw form.
    try:
        with open(_COMPANY_BLOCKLIST_FILE, "a", encoding="utf-8") as f:
            f.write(f"{clean}\n")
            if normalized and normalized != clean:
                f.write(f"{normalized}\n")
    except Exception:
        pass  # best effort: a missed persist only affects future runs


def _strip_emojis(text: str) -> str:
    """Remove emojis and other non-ASCII symbols from text."""
    import re
    # Remove emojis and other symbols (keeps basic Latin, extended Latin, numbers, punctuation)
    return re.sub(r'[^\w\s\-\.\,\&\|\@\#\(\)\[\]\{\}\'\"\:\;\/\\]', '', text, flags=re.UNICODE).strip()


def _load_tracker_companies_cache(tracker_path: Path | None = None) -> None:
    """Load company names from tracker into cache for validation during parsing."""
    global _tracker_companies_cache

    if _USE_DB:
        from icp_db import load_tracker_companies_cache
        _tracker_companies_cache = load_tracker_companies_cache(_get_db_session())
        return

    workbook_path = Path(tracker_path or TRACKER_PATH)
    if not workbook_path.exists():
        return

    try:
        workbook = openpyxl.load_workbook(workbook_path, read_only=True, data_only=True)
        sheet = workbook["Contacts"]
        header_row = [cell.value for cell in sheet[1]]
        try:
            col_idx = header_row.index("Company")
        except ValueError:
            col_idx = None

        if col_idx is not None:
            for values in sheet.iter_rows(min_row=2, values_only=True):
                raw = values[col_idx]
                if not (raw and isinstance(raw, str) and len(raw) > 1):
                    continue
                # Strip emojis before adding to cache
                cleaned = _strip_emojis(raw)
                if not cleaned or len(cleaned) <= 1:
                    continue
                _tracker_companies_cache.add(cleaned.lower().strip())
                # Also add normalized version
                norm = normalize_company_name(cleaned)
                if norm:
                    _tracker_companies_cache.add(norm)
        workbook.close()
    except Exception:
        pass  # Silently fail if tracker can't be read


def load_reference_lists(path: str | Path | None = None) -> dict:
    """Load ICP_Reference_Lists.xlsx into memory (or from DB).

    As a side effect, populates the module-level caches
    (_ref_companies_cache, _ref_companies_norm_cache, the tracker-company
    cache, and the blocklist) so the parsers can validate extracted
    company names.

    Args:
        path: Optional override for the reference workbook location;
            defaults to REFERENCE_PATH.

    Returns dict with:
        - companies: dict[lowercase_name] -> {name, tier, qualifying_titles}
        - companies_normalized: dict[normalized_name] -> {name, tier, ...}
        - aliases: dict[alias_lowercase] -> canonical_name
        - job_titles: list of {pattern, status, priority}
        - people: dict[lowercase_name] -> {status, company}
    """
    global _ref_companies_cache, _ref_companies_norm_cache

    if _USE_DB:
        import icp_db
        session = _get_db_session()
        ref = icp_db.load_reference_lists(session)
        # Populate module-level caches (same as Excel path below)
        _ref_companies_cache = set(ref["companies"].keys())
        _ref_companies_norm_cache = set(ref["companies_normalized"].keys())
        _load_tracker_companies_cache()
        _load_company_blocklist()
        return ref

    path = Path(path or REFERENCE_PATH)
    wb = openpyxl.load_workbook(path, read_only=True, data_only=True)

    # --- Companies ---
    ws = wb["Companies"]
    companies = {}
    companies_normalized = {}  # For fuzzy matching
    aliases = {}  # alias -> canonical name

    for row in ws.iter_rows(min_row=2, values_only=True):
        # Columns: Name, Tier, Qualifying Titles, Aliases (optional), Notes
        # Pad with None so short rows still unpack to exactly 5 values.
        row_list = list(row) + [None] * 5
        name, tier, qualifying_titles, aliases_str, *_ = row_list[:5]

        if name:
            name_clean = name.strip()
            name_lower = name_clean.lower()
            name_normalized = normalize_company_name(name_clean)

            company_info = {
                "name": name_clean,
                "tier": tier,
                # Qualifying titles are a comma-separated list in the sheet;
                # stored lowercased for case-insensitive matching.
                "qualifying_titles": (
                    [t.strip().lower() for t in qualifying_titles.split(",")]
                    if qualifying_titles
                    else []
                ),
            }

            # Store by lowercase name (exact match)
            companies[name_lower] = company_info

            # Store by normalized name (for fuzzy matching)
            if name_normalized:
                companies_normalized[name_normalized] = company_info

            # Parse and store aliases — both raw-lowercase and normalized
            # forms point at the canonical display name.
            if aliases_str:
                for alias in str(aliases_str).split(","):
                    alias_clean = alias.strip()
                    if alias_clean:
                        alias_lower = alias_clean.lower()
                        alias_normalized = normalize_company_name(alias_clean)
                        aliases[alias_lower] = name_clean
                        if alias_normalized:
                            aliases[alias_normalized] = name_clean

    # --- Job Titles ---
    ws = wb["Job Titles"]
    job_titles = []
    for row in ws.iter_rows(min_row=2, values_only=True):
        pattern, status, priority, *_ = (list(row) + [None] * 4)[:4]
        if pattern:
            job_titles.append(
                {
                    "pattern": pattern.strip().lower(),
                    # Missing status defaults to "Review" (manual triage).
                    "status": status.strip() if status else "Review",
                    "priority": int(priority) if priority else 50,
                }
            )
    # Sort by priority (lower = higher priority)
    job_titles.sort(key=lambda x: x["priority"])

    # --- People ---
    ws = wb["People"]
    people = {}
    for row in ws.iter_rows(min_row=2, values_only=True):
        pname, status, company, *_ = (list(row) + [None] * 4)[:4]
        # Skip the template placeholder row shipped with the workbook.
        if pname and pname != "Add specific people here":
            people[pname.strip().lower()] = {
                "status": status.strip() if status else "Review",
                "company": company,
            }

    wb.close()

    # Populate module-level caches for quick lookups during parsing
    _ref_companies_cache = set(companies.keys())
    _ref_companies_norm_cache = set(companies_normalized.keys())

    # Also load companies from tracker (includes non-ICP companies)
    _load_tracker_companies_cache()

    # Load blocklist of incorrectly extracted "companies"
    _load_company_blocklist()

    return {
        "companies": companies,
        "companies_normalized": companies_normalized,
        "aliases": aliases,
        "job_titles": job_titles,
        "people": people,
    }


# ---------------------------------------------------------------------------
# Classification
# ---------------------------------------------------------------------------
# Pattern lists below are lowercase substrings checked against lowercased
# titles/companies by classify_contact().
_EXCLUSION_PATTERNS = [
    "student", "studerende", "ledig", "job seeker", "jobsøgende",
    "retired", "pensioneret", " intern", "praktikant", "trainee",  # " intern" with space to avoid matching "international"
    "looking for", "søger", "seeking", "between jobs", "in transition",
    "career break",
]

# Companies that are competitors or otherwise excluded from ICP
_EXCLUDED_COMPANIES = [
    "dalux",  # ConTech competitor
    "procore", "aconex", "plangrid", "autodesk",  # Software competitors
    "selvstændig", "self-employed", "freelance", "independent",  # Freelancers
]

# Companies that should be classified as POI (industry contacts, not customers)
# NOTE: matched against the company name by classify_contact(); several
# entries are short acronyms ("ku", "dtu"), so the matching strategy matters.
_POI_COMPANIES = [
    # ConTech / software (industry contacts worth tracking)
    "letsbuild", "contechlab",
    # Industry associations
    "di byggeri", "dansk industri", "molio", "bygherreforeningen",
    "danske arkitektvirksomheder", "foreningen af rådgivende ingeniører",
    # Universities
    "iha", "dtu", "aau", "ku", "sdu", "cbs",
    "alexandra instituttet", "teknologisk institut",
]

# Junior/operational roles - classify as POI even at ICP companies
# These roles don't make purchasing decisions
_JUNIOR_ROLES = [
    # Trades
    "tømrer", "snedker", "murer", "maler",
    "elektriker", "electrician",  # Electrician trade
    "vvs", "plumber",  # Plumber trade
    # Site foremen (specific patterns to avoid matching "projektformand")
    "sjakformand", "arbejdsformand", "pladsformand",
    "foreman", "sjakbajs",
    # Trade foremen (compound words)
    "malerformand", "tømrerformand", "murerformand",
    # Apprentices
    "lærling", "apprentice",
    # Installers
    "montør", "installatør",
    # General trades
    "håndværker", "craftsman",
    # Service roles
    "servicemedarbejder", "servicetekniker",
    # Junior engineering
    "konstruktionsingeniør",
    # Additional junior/operational
    "tekniker", "technician",
    "operatør", "operator",
    "chauffør", "driver",
    "lagermedarbejder", "warehouse",
    "rengøring", "cleaning",
    "pedel", "caretaker",
]

# Title patterns that indicate Not ICP (wrong department/role)
_NOT_ICP_TITLE_PATTERNS = [
    "sales", "account executive", "business development",
    "marketing", "kommunikation", "hr ", "human resources",
    "talent", "recruitment", "people", "culture",
    "finance", "accounting", "legal", "lawyer", "advokat",
    "selvstændig", "freelance", "self-employed", "independent consultant",
    # Vendor/SaaS roles (not construction decision-makers)
    "customer success", "account manager", "sales executive",
    # Operational/coordination roles
    "koordinator", "coordinator",
    "sekretær", "secretary", "assistent", "assistant",
    # Supply chain (logistics, not construction)
    "supply chain", "logistik", "logistics", "indkøb",
    # IT support (not decision-makers)
    "it support", "helpdesk", "servicedesk",
    # Software development (typically not construction industry)
    "software engineer", "software developer", "frontend", "backend",
    "full stack", "fullstack", "devops", "sre ", "data engineer",
    "data scientist", "machine learning", "ml engineer",
]

# Generic titles that require a known ICP company to classify as ICP
# Without company context, these should return Review instead
_GENERIC_TITLES = [
    # C-level and leadership (need company context)
    "ceo", "cfo", "coo", "head of", "director", "direktør",
    "formand", "leder", "chef",
    # Project management (could be any industry)
    "project manager", "program manager", "manager",
    "projektleder", "projektchef", "programleder",
    # Construction leadership (still need company for non-construction orgs)
    "site manager", "byggeleder",
    "construction manager",
    "estimator", "kalkulationschef",
    "operations", "driftschef",
    # Consulting/advisory (ICP if company is known)
    "konsulent", "rådgiver", "advisor",
    # Risk/compliance (could be any industry)
    "risk", "compliance",
]

# Construction-specific patterns that indicate ICP regardless of company
# These are specific enough that they don't need company context
_CONSTRUCTION_SPECIFIC_PATTERNS = [
    "qhse", "bygningskonstruktør", "entrepriseleder", "entreprisechef",
    "bygherrerådgiver", "bygherrerådgivning", "anlægsleder", "anlægschef",
    "dgnb", "breeam", "arbejdsmiljøkoordinator", "amk-p", "amk-b",
    "ikt-leder", "ikt projektleder", "lca ansvarlig",
]


def classify_contact(
    name: str,
    title: str,
    company: str | None,
    ref: dict,
) -> tuple[str, str | None, str | None]:
    """Classify a contact. Returns (status, company_tier, match_reason).

    Priority order:
    1. People list (exact name match)
    2. Companies list:
       a. Exact match
       b. Normalized match (strips suffixes like A/S, Danmark)
       c. Alias match
       d. Fuzzy match (80%+ similarity)
    3. Job Titles list (pattern match)
    4. Exclusion rules
    5. Fallback → 'Review'

    match_reason is returned for debugging/transparency (e.g., "fuzzy:85%", "alias", "exact")
    """
    name_lower = (name or "").strip().lower()
    title_lower = (title or "").strip().lower()
    company_lower = (company or "").strip().lower()
    company_normalized = normalize_company_name(company or "")

    # 1. People list
    if name_lower in ref["people"]:
        entry = ref["people"][name_lower]
        return entry["status"], None, "person"

    # 2. Hard exclusions (always exclude regardless of company)
    if any(exc in title_lower for exc in _EXCLUSION_PATTERNS):
        return "Not ICP", None, "excluded"

    # 3. Check if company is a POI-only organization (industry associations, competitors as contacts)
    if company_lower and any(poi in company_lower for poi in _POI_COMPANIES):
        return "POI", None, "poi_company"

    # Helper function to classify based on company entry
    def _classify_by_company(entry, match_type):
        tier = entry["tier"]
        quals = entry["qualifying_titles"]
        tier_str = f"Tier {tier}" if tier else None

        # Check for junior/operational roles - demote to POI even at ICP companies
        if any(jr in title_lower for jr in _JUNIOR_ROLES):
            return "POI", tier_str, f"{match_type}:junior_role"

        # If qualifying titles are specified, check if contact's title matches
        if quals:
            if any(q in title_lower for q in quals if q):
                return "ICP", tier_str, match_type
            else:
                return "POI", tier_str, match_type
        else:
            # No qualifying titles restriction → ICP by default for target company
            return "ICP", tier_str, match_type

    # 2a. Exact company match
    if company_lower and company_lower in ref["companies"]:
        return _classify_by_company(ref["companies"][company_lower], "exact")

    # 2b. Normalized company match (strips A/S, Danmark, etc.)
    companies_normalized = ref.get("companies_normalized", {})
    if company_normalized and company_normalized in companies_normalized:
        return _classify_by_company(companies_normalized[company_normalized], "normalized")

    # 2c. Alias match
    aliases = ref.get("aliases", {})
    canonical_name = aliases.get(company_lower) or aliases.get(company_normalized)
    if canonical_name:
        canonical_lower = canonical_name.lower()
        if canonical_lower in ref["companies"]:
            return _classify_by_company(ref["companies"][canonical_lower], "alias")

    # 2d. Fuzzy company match
    if company_normalized and companies_normalized:
        fuzzy_match, fuzzy_score = fuzzy_match_company(
            company or "", companies_normalized, threshold=0.80
        )
        if fuzzy_match:
            fuzzy_lower = fuzzy_match.lower()
            if fuzzy_lower in ref["companies"]:
                match_type = f"fuzzy:{int(fuzzy_score * 100)}%"
                return _classify_by_company(ref["companies"][fuzzy_lower], match_type)

    # 4. Role and company exclusions (AFTER company matching failed)
    # If we reach here, company is NOT in the reference list

    # 4a. Check if company is a competitor or excluded type (freelancers, etc.)
    if company_lower and any(exc in company_lower for exc in _EXCLUDED_COMPANIES):
        return "Not ICP", None, "excluded_company"

    # 5. Job Titles list - check reference patterns BEFORE exclusion rules
    # This ensures specific roles like "Proceskoordinator" get classified correctly
    for jt in ref["job_titles"]:
        if jt["pattern"] in title_lower:
            # ICP status ALWAYS requires company context
            # Without known company, ICP patterns go to Review
            if jt["status"] == "ICP":
                return "Review", None, f"needs_company:{jt['pattern']}"
            # POI and Not ICP can be returned directly
            return jt["status"], None, f"title:{jt['pattern']}"

    # 6. Not ICP roles (sales, HR, marketing) - fallback if no Job Title match
    # Check if title contains construction-specific patterns first
    # These should go to Review (potential ICP), not auto-exclude
    has_construction_pattern = any(
        cs in title_lower for cs in _CONSTRUCTION_SPECIFIC_PATTERNS
    )
    if not has_construction_pattern:
        if any(pattern in title_lower for pattern in _NOT_ICP_TITLE_PATTERNS):
            return "Not ICP", None, "excluded_role"

    # 7. No match
    return "Review", None, None


def classify_contact_simple(
    name: str,
    title: str,
    company: str | None,
    ref: dict,
) -> tuple[str, str | None]:
    """Classify a contact, returning only (status, company_tier).

    Thin wrapper around classify_contact() for callers that don't need
    the match_reason diagnostic (kept for backwards compatibility).
    """
    return classify_contact(name, title, company, ref)[:2]


# ---------------------------------------------------------------------------
# Tracker read/write helpers
# ---------------------------------------------------------------------------
def _read_contacts_sheet(ws) -> list[dict]:
    """Read the Contacts sheet into a list of dicts."""
    headers = [cell.value for cell in ws[1]]
    contacts = []
    for row in ws.iter_rows(min_row=2, values_only=True):
        d = dict(zip(headers, row))
        contacts.append(d)
    return contacts


def _read_interactions_sheet(ws) -> list[dict]:
    """Read the All Interactions sheet into a list of dicts."""
    headers = [cell.value for cell in ws[1]]
    interactions = []
    for row in ws.iter_rows(min_row=2, values_only=True):
        d = dict(zip(headers, row))
        interactions.append(d)
    return interactions


# ---------------------------------------------------------------------------
# Parse raw LinkedIn data (paste or CSV-like)
# ---------------------------------------------------------------------------
#
# ===========================================================================
# PROTECTED: COMPANY IDENTIFICATION FLOW - DO NOT MODIFY WITHOUT USER REQUEST
# ===========================================================================
# This flow is intentional and must not be changed unless explicitly requested.
#
# When parsing new LinkedIn data, company identification follows this order:
#
#   1. LOAD CACHES: Load known companies from Contacts sheet and Reference Lists
#
#   2. EXTRACT: Try to extract company from title using patterns:
#      - @ pattern: "Role @ Company"
#      - Preposition: "Role at/hos/for/ved/i Company"
#      - Suffix: "Role CompanyName A/S" (scan for A/S, ApS, Group, etc.)
#      - Company-first: "CompanyName - Role"
#      - Delimiter: "Role | Company" or "Role - Company"
#
#   3. VALIDATE extracted company (in order):
#      a) Blocklist check → Reject if blocked
#      b) Has suffix (A/S, ApS, Group) → Accept (high confidence)
#      c) In tracker/reference cache → Accept (known company)
#      d) Preposition-based extraction → Accept (high confidence)
#      e) CVR validates → Accept with official name
#      f) Brave web search → Accept if found
#      g) None of the above → Leave empty (goes to Review)
#
#   4. FALLBACK: If no extraction at all, try Brave web search for Name + Title
#
#   5. RESULT: Company is either filled or empty (empty → needs manual review)
#
# This ensures:
#   - Known companies like XMANO are matched from cache
#   - Preposition extractions (hos Junkbusters) are accepted directly
#   - Uncertain extractions are validated via CVR then Brave
#   - False positives (Donorbygninger) are rejected and go to Review
# ===========================================================================
#
def parse_raw_data(raw_text: str, minimal_mode: bool = False, source_profile: str = "") -> list[dict]:
    """Parse pasted LinkedIn data into interaction dicts.

    Date is always set to today (when data is pasted), representing when the
    interaction was recorded in the tracker.

    Supported formats:
    1. Executive profile format (auto-detected):
        "Executive profile: XX  Post Hook: YYYY"
        Followed by LinkedIn reactions list

    2. Full mode (default):
        Tab-separated or comma-separated rows with headers:
        Post Hook, Type, Name, Title, Company, Profile URL, Source Profile
        (Date column is ignored if present - today's date is used)

    3. Minimal mode (--minimal flag or auto-detected):
        Simplified tab-separated input without headers:
        Name<tab>Title<tab>Company<tab>PostHook
        or even just:
        Name<tab>Title

        - Type defaults to "Reaction"
        - Company extracted from title if not provided

    Args:
        raw_text: The raw pasted data
        minimal_mode: Force minimal mode parsing
        source_profile: Optional source profile name (executive who posted)

    Returns:
        List of interaction dicts (Date, Post Hook, Type, Name, Title,
        Company, Profile URL, Source Profile).
    """
    # Ensure company caches are loaded for matching against existing tracker data
    if not _tracker_companies_cache:
        _load_tracker_companies_cache()
    if not _ref_companies_cache:
        # Load reference lists to populate ref cache (lightweight call if already loaded)
        try:
            load_reference_lists()
        except Exception:
            pass  # Continue without ref cache if file not found

    # Check for "Executive profile" format first (before cleaning)
    # Supports both "Executive profile: XX" and "Executive profile = XX" formats
    raw_lines = raw_text.strip().splitlines()
    first_line_lower = raw_lines[0].lower() if raw_lines else ""

    if first_line_lower.startswith("executive profile"):
        # Header line carries both the source profile and the post hook;
        # extract both, then delegate the rest to parse_linkedin_likes().
        header_line = raw_lines[0]
        extracted_source = ""
        extracted_hook = ""

        # Normalize the header line - replace " = " with ": " for consistent parsing
        normalized_header = header_line.replace(" = ", ": ").replace("=", ":")
        # Also normalize "post hook" variations
        normalized_lower = normalized_header.lower()

        # Extract source profile (between "executive profile:" and "post hook:")
        if "executive profile" in normalized_lower:
            # Find where executive profile value starts
            exec_match = normalized_lower.find("executive profile")
            after_exec_start = exec_match + len("executive profile")
            # Skip any : or whitespace
            while after_exec_start < len(normalized_header) and normalized_header[after_exec_start] in ": \t":
                after_exec_start += 1

            # Find where post hook starts
            post_hook_start = normalized_lower.find("post hook")
            if post_hook_start != -1:
                # Source is between executive profile and post hook
                extracted_source = normalized_header[after_exec_start:post_hook_start].strip()

                # Get post hook value
                hook_value_start = post_hook_start + len("post hook")
                # Skip any : = or whitespace
                while hook_value_start < len(normalized_header) and normalized_header[hook_value_start] in ":= \t":
                    hook_value_start += 1
                extracted_hook = normalized_header[hook_value_start:].strip()
            else:
                # No post hook, just get source
                extracted_source = normalized_header[after_exec_start:].strip()

        # Check if the post hook has the first person's name embedded at the end
        # LinkedIn appends "NameView Name's profile" to the header line
        first_person_name = ""
        _view_in_hook_re = re.search(
            r"\s{2,}(.+?)View\s+.+?['\u2019\u0027\u02BC]s?\s+profile\s*$",
            extracted_hook, re.IGNORECASE
        )
        if _view_in_hook_re:
            first_person_name = _view_in_hook_re.group(1).strip()
            extracted_hook = extracted_hook[:_view_in_hook_re.start()].strip()

        # Remove trailing reaction type from post hook (e.g., "... like" -> "...")
        reaction_types = ["like", "love", "celebrate", "support", "insightful", "funny", "curious"]
        for rt in reaction_types:
            if extracted_hook.lower().endswith(" " + rt):
                extracted_hook = extracted_hook[:-len(rt)-1].strip()
                break

        # Use extracted values or fall back to provided parameters
        final_source = extracted_source or source_profile
        final_hook = extracted_hook

        # Prepend the first person's name if it was embedded in the header
        # so the alternating Name/Title parser sees a complete first pair.
        remaining_text = "\n".join(raw_lines[1:])
        if first_person_name:
            remaining_text = first_person_name + "\n" + remaining_text
        return parse_linkedin_likes(remaining_text, post_hook=final_hook, source_profile=final_source)

    # Clean LinkedIn-specific noise (connection degrees, etc.)
    raw_text = _clean_linkedin_data(raw_text)

    lines = raw_text.strip().splitlines()
    if not lines:
        return []

    # Auto-detect minimal mode: no standard headers found
    header_line = lines[0].lower().replace(" ", "")
    has_headers = any(
        h in header_line
        for h in ["name", "title", "posthook", "type", "company"]
    )

    if not has_headers or minimal_mode:
        # --- Minimal Mode ---
        today = datetime.now().strftime("%Y-%m-%d")
        rows = []

        for line in lines:
            line = line.strip()
            if not line:
                continue

            # Split by tab
            parts = [p.strip() for p in line.split("\t")]

            if len(parts) >= 1:
                # Positional columns: Name, then optional Title/Company/PostHook.
                name = parts[0]
                title = parts[1] if len(parts) > 1 else ""
                company = parts[2] if len(parts) > 2 else ""
                post_hook = parts[3] if len(parts) > 3 else ""

                # If company not provided, try to extract from title
                if not company and title:
                    company = parse_title_for_company(title) or ""

                # Skip SiteHub employees
                if _is_sitehub_employee(company, title, name):
                    continue

                rows.append({
                    "Date": today,
                    "Post Hook": post_hook,
                    "Type": "Reaction",
                    "Name": name,
                    "Title": title,
                    "Company": company,
                    "Profile URL": "",
                    "Source Profile": source_profile,
                })

        return rows

    # --- Full Mode (with headers) ---
    today = datetime.now().strftime("%Y-%m-%d")
    # Prefer tab separation; fall back to comma when no tabs in the header.
    sep = "\t" if "\t" in lines[0] else ","
    headers_raw = [h.strip().strip('"') for h in lines[0].split(sep)]

    # Map to expected column names (Date is excluded - we always use today)
    col_map = {}
    expected = [
        "Post Hook", "Type", "Name", "Title",
        "Company", "Profile URL", "Source Profile",
    ]
    # Case- and whitespace-insensitive header matching ("posthook" == "Post Hook").
    for i, h in enumerate(headers_raw):
        for e in expected:
            if h.lower().replace(" ", "") == e.lower().replace(" ", ""):
                col_map[e] = i
                break

    rows = []
    for line in lines[1:]:
        vals = [v.strip().strip('"') for v in line.split(sep)]
        row = {"Date": today}  # Always use today's date
        for col_name, idx in col_map.items():
            row[col_name] = vals[idx] if idx < len(vals) else ""

        # Override Source Profile if provided via parameter
        if source_profile:
            row["Source Profile"] = source_profile

        # Skip SiteHub employees
        if _is_sitehub_employee(row.get("Company"), row.get("Title"), row.get("Name")):
            continue

        rows.append(row)

    return rows


def quick_add_entry(name: str, title: str, post_hook: str = "") -> dict:
    """Create a single interaction entry from minimal input.

    Used with --quick flag for rapid single-entry mode.
    """
    return {
        "Date": datetime.now().strftime("%Y-%m-%d"),
        "Post Hook": post_hook,
        "Type": "Reaction",
        "Name": name,
        "Title": title,
        "Company": parse_title_for_company(title) or "",
        "Profile URL": "",
        "Source Profile": "",
    }


# ---------------------------------------------------------------------------
# Phase 4: LinkedIn Likes Parser
# ---------------------------------------------------------------------------
def parse_linkedin_likes(raw_text: str, post_hook: str = "", source_profile: str = "") -> list[dict]:
    """Parse raw LinkedIn "likers" list with alternating Name/Title lines.

    LinkedIn's reaction list typically copies as:
        John Doe
        CEO at Acme Inc
        Jane Smith
        Director | BigCorp
        ...

    This function detects and handles this alternating pattern. Company
    identification follows the PROTECTED validation cascade documented above
    parse_raw_data (blocklist → suffix → caches → preposition → CVR → Brave).

    Args:
        raw_text: The pasted text from LinkedIn likes list
        post_hook: Optional post hook to associate with all entries
        source_profile: Optional source profile name (executive who posted)

    Returns:
        List of interaction dicts ready for tracker import
    """
    # Clean LinkedIn-specific noise (connection degrees, etc.)
    raw_text = _clean_linkedin_data(raw_text)

    lines = [line.strip() for line in raw_text.strip().splitlines() if line.strip()]

    if not lines:
        return []

    today = datetime.now().strftime("%Y-%m-%d")
    rows = []

    # Detect if this looks like alternating Name/Title format
    # After cleaning, data should be clean Name/Title pairs
    def _looks_like_name(text: str) -> bool:
        # Heuristic: person names are short word runs without title punctuation.
        # Names are typically 2-5 words, no special title characters
        words = text.split()
        if len(words) < 1 or len(words) > 6:
            return False
        # Names don't usually contain these (but titles do)
        if any(c in text for c in ["|", "@", "–", "—"]):
            return False
        # Organization/company patterns (not person names)
        lower = text.lower()
        org_indicators = ["videnscenter", "lab", "center", "centre", "institut",
                          "forening", "forbund", "group", "consulting", "solutions",
                          "technologies", "services", "partners", "associates"]
        # If contains " - " and an org indicator, it's likely a company page
        if " - " in text and any(ind in lower for ind in org_indicators):
            return False
        # Names don't usually start with these patterns
        title_starts = ["ceo", "cfo", "cto", "coo", "ciso", "vp ", "senior ", "junior ",
                        "head of", "director", "manager", "chef", "leder", "direktør",
                        "partner", "founder", "investor", "professor", "dr.", "adm."]
        if any(lower.startswith(t) for t in title_starts):
            return False
        return True

    def _looks_like_title(text: str) -> bool:
        # Heuristic: job titles contain role keywords or title punctuation.
        # Titles often contain these indicators
        indicators = [
            " at ", " hos ", " i ", " @ ", "|", "–", "—", ",",
            "manager", "director", "engineer", "ceo", "cfo", "cto",
            "chef", "leder", "konsulent", "rådgiver", "partner",
            "founder", "owner", "specialist", "analyst", "developer",
            "lead", "head", "senior", "projektleder", "direktør",
        ]
        lower = text.lower()
        return any(ind in lower for ind in indicators)

    # After cleaning LinkedIn data, we should have clean alternating Name/Title pairs.
    # Use strict alternating mode: odd indices (0,2,4...) are names, even indices (1,3,5...) are titles
    # Only skip if the "name" clearly looks like a title
    i = 0
    while i < len(lines):
        potential_name = lines[i]

        # Skip if this looks like a title (starting with title keywords)
        lower_name = potential_name.lower()
        if any(lower_name.startswith(t) for t in ["ceo", "cfo", "director", "manager", "senior ", "head of"]):
            i += 1
            continue

        # Check if there's a next line (the title)
        if i + 1 < len(lines):
            potential_title = lines[i + 1]

            # In strict alternating mode, accept the pair if name looks reasonable
            name_valid = _looks_like_name(potential_name)

            if name_valid:
                name = potential_name
                title = potential_title
                # Extract company from title and validate
                # (PROTECTED cascade — see the block comment above parse_raw_data)
                extracted = parse_title_for_company(title)
                company = ""
                if extracted:
                    extracted_lower = extracted.lower().strip()
                    extracted_norm = normalize_company_name(extracted)

                    # Check blocklist first - reject known bad extractions
                    if extracted_lower in _company_blocklist or extracted_norm in _company_blocklist:
                        pass  # Reject - known bad extraction

                    # High confidence: has company suffix (A/S, ApS, Group, etc.)
                    elif _has_company_suffix(extracted):
                        company = extracted

                    # High confidence: already known in tracker or reference lists
                    elif extracted_lower in _tracker_companies_cache or extracted_norm in _tracker_companies_cache:
                        company = extracted
                    elif extracted_lower in _ref_companies_cache or extracted_norm in _ref_companies_norm_cache:
                        company = extracted

                    # High confidence: extracted via preposition pattern (at/hos/for/ved/i)
                    elif _is_preposition_based_extraction(title, extracted):
                        company = extracted

                    # Low confidence: validate via CVR before accepting
                    else:
                        validated = cvr_fuzzy_validate(extracted, threshold=0.70)
                        if validated:
                            company = validated
                        # If CVR doesn't validate, try Brave web search
                        else:
                            web_company = brave_lookup_company(name, title, verbose=False)
                            if web_company:
                                company = web_company

                # If no extraction at all, try Brave web search as fallback
                if not company and name and title:
                    web_company = brave_lookup_company(name, title, verbose=False)
                    if web_company:
                        company = web_company

                # Skip SiteHub employees
                if _is_sitehub_employee(company, title, name):
                    i += 2
                    continue

                rows.append({
                    "Date": today,
                    "Post Hook": post_hook,
                    "Type": "Reaction",
                    "Name": name,
                    "Title": title,
                    "Company": company,
                    "Profile URL": "",
                    "Source Profile": source_profile,
                })
                i += 2  # Move past both name and title
                continue

        # If pattern doesn't match, try treating current line as name-only
        # (fallback for edge cases)
        if _looks_like_name(potential_name) and not _looks_like_title(potential_name):
            rows.append({
                "Date": today,
                "Post Hook": post_hook,
                "Type": "Reaction",
                "Name": potential_name,
                "Title": "",
                "Company": "",
                "Profile URL": "",
                "Source Profile": source_profile,
            })

        i += 1

    return rows


# ---------------------------------------------------------------------------
# Parsed Data Sanity Check — detect format changes before writing to DB
# ---------------------------------------------------------------------------
# Patterns that should never appear in contact names after proper parsing.
# Applied case-insensitively to Name (and Title) fields; a hit suggests
# LinkedIn changed its copy-paste format and the parser is mis-splitting.
_GARBLED_NAME_PATTERNS = re.compile(
    r"degree connection|"        # Connection degree leaked into name
    r"View\s+.+profile|"        # LinkedIn accessibility text in name
    r"^\s*\*\s|"                 # LinkedIn list marker prefix
    r"^\d+(?:st|nd|rd|th)\s|"   # Starts with "1st", "2nd", etc.
    r"\d+\s*followers?\s*$",    # Follower count as name
    re.IGNORECASE,
)


def validate_parsed_interactions(rows: list[dict], abort_threshold: float = 0.15) -> list[dict]:
    """Sanity-check parsed interactions for signs of a broken/changed LinkedIn format.

    Scans Name and Title fields for patterns that indicate the parser didn't
    handle the input correctly (e.g., "1st degree connection" as a name,
    "* NameView Name's profile" as a name).

    Args:
        rows: Parsed interaction dicts from parse_raw_data / parse_linkedin_likes
        abort_threshold: Fraction of rows with issues that triggers an abort (default 15%)

    Returns:
        The rows unchanged if validation passes.

    Raises:
        ValueError: If too many rows look garbled, with details about what went wrong.
    """
    if not rows:
        return rows

    garbled = []
    for i, row in enumerate(rows):
        name = row.get("Name", "")
        problems = []
        if _GARBLED_NAME_PATTERNS.search(name):
            problems.append(f"name={name!r}")
        title = row.get("Title", "")
        if _GARBLED_NAME_PATTERNS.search(title) and not title.strip().startswith("View"):
            # Titles can legitimately contain some keywords, only flag if name-like
            if re.match(r"^\d+(?:st|nd|rd|th)\s+degree", title, re.IGNORECASE):
                problems.append(f"title={title!r}")
        if problems:
            garbled.append((i, problems))

    if not garbled:
        return rows

    ratio = len(garbled) / len(rows)
    # Always warn
    print(f"\n  WARNING: {len(garbled)}/{len(rows)} parsed rows ({ratio:.0%}) have suspicious names/titles:")
    for idx, problems in garbled[:5]:
        print(f"    Row {idx}: {', '.join(problems)}")
    if len(garbled) > 5:
        print(f"    ... and {len(garbled) - 5} more")

    if ratio >= abort_threshold:
        raise ValueError(
            f"ABORTING: {len(garbled)}/{len(rows)} ({ratio:.0%}) of parsed interactions "
            f"look garbled — LinkedIn may have changed their copy-paste format. "
            f"No data was written to the database. "
            f"Review the input and update the parser if needed."
        )

    return rows


# ---------------------------------------------------------------------------
# Phase 3: Auto-Learn Suggestions
# ---------------------------------------------------------------------------
def _analyze_suggestions(
    contacts: list[dict],
    ref: dict,
    min_occurrences: int = 3,
    verbose: bool = True,
) -> dict:
    """Analyze contacts to suggest additions to reference lists.

    Surfaces:
    - Companies that appear 3+ times among ICP contacts but aren't in reference
    - Title patterns that appear 3+ times among ICP contacts but aren't covered

    Returns dict with suggested_companies and suggested_patterns lists.
    """
    # Get ICP contacts only
    icp_contacts = [c for c in contacts if c.get("Status") == "ICP"]

    # Existing reference data
    existing_companies = set(ref["companies"].keys())
    existing_companies_normalized = set(ref.get("companies_normalized", {}).keys())
    existing_aliases = set(ref.get("aliases", {}).keys())
    existing_title_patterns = {jt["pattern"] for jt in ref["job_titles"]}

    # 1. Company frequency analysis among ICP contacts
    company_freq: dict[str, int] = {}
    for c in icp_contacts:
        comp = (c.get("Company") or "").strip()
        if comp:
            company_freq[comp] = company_freq.get(comp, 0) + 1

    # Find companies that appear frequently but aren't in reference
    suggested_companies = []
    for comp, count in sorted(company_freq.items(), key=lambda x: -x[1]):
        if count < min_occurrences:
            continue
        comp_lower = comp.lower()
        comp_normalized = normalize_company_name(comp)
        # Check if already in reference (any form)
        if (comp_lower not in existing_companies and
            comp_normalized not in existing_companies_normalized and
            comp_lower not in existing_aliases and
            comp_normalized not in existing_aliases):
            suggested_companies.append((comp, count))

    # 2. Title keyword analysis among ICP contacts
    stopwords = {
        "and", "or", "the", "a", "an", "of", "in", "at", "to", "for", "with",
        "og", "i", "hos", "til", "med", "på", "ved", "af", "|", "-", "–", "—",
        "&", "/", ".", ",", "(", ")", "[", "]",
    }
    word_freq: dict[str, int] = {}
    for c in icp_contacts:
        title = (c.get("Title") or "").lower()
        words = re.findall(r"[a-zæøå]+", title)
        for w in words:
            if len(w) >= 3 and w not in stopwords:
                word_freq[w] = word_freq.get(w, 0) + 1

    # Find patterns that appear frequently but aren't covered
    suggested_patterns = []
    for word, count in sorted(word_freq.items(), key=lambda x: -x[1]):
        if count < min_occurrences:
            continue
        # Check if word is already covered by existing patterns
        covered = any(word in pat or pat in word for pat in existing_title_patterns)
        if not covered:
            suggested_patterns.append((word, count))

    # Print suggestions if verbose
    if verbose and (suggested_companies or suggested_patterns):
        print("\n=== Suggested Reference List Additions ===")

        if suggested_companies:
            print(f"\nCompanies ({min_occurrences}+ ICP contacts, not in reference):")
            for comp, count in suggested_companies[:10]:
                print(f"  {count:3d}x  {comp}")
            if len(suggested_companies) > 10:
                print(f"  ... and {len(suggested_companies) - 10} more")

        if suggested_patterns:
            print(f"\nTitle patterns ({min_occurrences}+ ICP contacts, not covered):")
            for pattern, count in suggested_patterns[:10]:
                print(f"  {count:3d}x  {pattern}")
            if len(suggested_patterns) > 10:
                print(f"  ... and {len(suggested_patterns) - 10} more")

        print()

    return {
        "suggested_companies": suggested_companies,
        "suggested_patterns": suggested_patterns,
    }


# ---------------------------------------------------------------------------
# Update tracker workbook
# ---------------------------------------------------------------------------
def backfill_source_profile(
    tracker_path: str | Path | None = None,
    source_name: str = "",
    verbose: bool = True,
) -> int:
    """Set Source Profile for all interactions that have an empty value.

    Args:
        tracker_path: Path to the tracker workbook (Excel mode only).
        source_name: The value to set for empty Source Profile cells.
        verbose: Print progress information.

    Returns:
        Number of rows updated.
    """
    if verbose:
        print("=== Backfill Source Profile ===")

    if _USE_DB:
        from icp_db import Interaction, resolve_source_profile

        session = _get_db_session()
        source_id = resolve_source_profile(session, source_name)
        # Interactions with neither a resolved source nor a raw source string
        blank_raw = (
            (Interaction.source_profile_raw.is_(None))
            | (Interaction.source_profile_raw == "")
        )
        pending = session.query(Interaction).filter(
            (Interaction.source_profile_id.is_(None)) & blank_raw
        ).all()
        for interaction in pending:
            interaction.source_profile_raw = source_name
            if source_id:
                interaction.source_profile_id = source_id
        session.commit()
        updated = len(pending)
        if verbose:
            print(f'  Updated {updated} interactions with Source Profile = "{source_name}"')
        return updated

    # --- Excel mode ---
    workbook_path = Path(tracker_path or TRACKER_PATH)
    wb = openpyxl.load_workbook(workbook_path)
    ws = wb["All Interactions"]

    # Locate the Source Profile column from the header row
    headers = [cell.value for cell in ws[1]]
    try:
        source_col = headers.index("Source Profile") + 1  # openpyxl is 1-based
    except ValueError:
        print("  Error: 'Source Profile' column not found in All Interactions sheet")
        return 0

    # Fill every blank cell in that column (row 1 is the header)
    updated = 0
    for row_num in range(2, ws.max_row + 1):
        cell = ws.cell(row=row_num, column=source_col)
        value = cell.value
        if not value or (isinstance(value, str) and not value.strip()):
            cell.value = source_name
            updated += 1

    wb.save(workbook_path)

    if verbose:
        print(f'  Updated {updated} interactions with Source Profile = "{source_name}"')

    return updated


def _process_contacts_from_interactions(
    all_interactions: list[dict],
    existing_contacts: list[dict],
    ref: dict,
    use_web: bool = False,
    use_cvr: bool = False,
    verbose: bool = True,
    new_interactions_count: int = 0,
) -> tuple[list[dict], set[str], dict, dict, int, int, int, int, dict]:
    """Shared processing logic: build contacts from interactions, enrich, classify.

    Used by both Excel and DB branches of update_tracker().

    Args:
        all_interactions: Every interaction row (pre-existing + newly appended).
        existing_contacts: Previously stored contact rows; their manually set
            Status (anything other than "Review"), Notes, and Company values
            are carried over into the rebuilt contact list.
        ref: Reference lists used for enrichment and classification.
        use_web: Passed through to enrich_missing_companies (web lookup).
        use_cvr: Passed through to enrich_missing_companies (CVR API lookup).
        verbose: Print progress information.
        new_interactions_count: Number of interactions appended this run.
            NOTE(review): currently unused inside this function — kept for
            interface stability; confirm before removing.

    Returns (contacts_list, new_contact_keys, status_counts, new_status_counts,
             stage_a, stage_b, stage_c, validated, suggestions).
    """
    # Index prior contact state by lowercased name so manual edits survive the
    # full rebuild below.
    existing_status_map = {
        (c.get("Name") or "").strip().lower(): c.get("Status", "")
        for c in existing_contacts
    }
    existing_notes_map = {
        (c.get("Name") or "").strip().lower(): c.get("Notes", "")
        for c in existing_contacts
    }
    existing_company_map = {
        (c.get("Name") or "").strip().lower(): c.get("Company", "")
        for c in existing_contacts
    }

    # --- Build / refresh Contacts from All Interactions ---
    contacts_map: dict[str, dict] = {}
    new_contact_keys: set[str] = set()
    sitehub_skipped = 0
    for inter in all_interactions:
        name = (inter.get("Name") or "").strip()
        if not name:
            continue

        # Own employees engaging with posts are noise, not prospects.
        if _is_sitehub_employee(inter.get("Company"), inter.get("Title"), name):
            sitehub_skipped += 1
            continue

        key = name.lower()
        if key not in contacts_map:
            existing_status = existing_status_map.get(key, "")
            # Preserve manual classifications; "Review" is re-evaluated each run.
            preserve_status = existing_status if existing_status and existing_status != "Review" else ""
            # Prefer the previously stored (possibly enriched) company over the
            # raw value on this interaction row.
            existing_company = existing_company_map.get(key, "")
            if key not in existing_status_map:
                new_contact_keys.add(key)
            contacts_map[key] = {
                "Name": name,
                "Title": inter.get("Title") or "",
                "Company": existing_company or inter.get("Company") or "",
                "Status": preserve_status,
                "Company Tier": "",
                "Total Interactions": 0,
                "First Interaction": inter.get("Date"),
                "Latest Interaction": inter.get("Date"),
                "Notes": existing_notes_map.get(key, ""),
            }
        c = contacts_map[key]
        c["Total Interactions"] += 1
        # Backfill Title/Company from later interactions if still missing.
        if inter.get("Title") and not c["Title"]:
            c["Title"] = inter["Title"]
        if inter.get("Company") and not c["Company"]:
            c["Company"] = inter["Company"]
        d = inter.get("Date")
        if d:
            # Normalize datetime -> date so the min/max comparisons below
            # never mix datetime and date objects.
            if isinstance(d, datetime):
                d = d.date()
            first = c["First Interaction"]
            if isinstance(first, datetime):
                first = first.date()
            latest = c["Latest Interaction"]
            if isinstance(latest, datetime):
                latest = latest.date()
            if first is None or d < first:
                c["First Interaction"] = d
            if latest is None or d > latest:
                c["Latest Interaction"] = d

    contacts_list = list(contacts_map.values())

    if verbose and sitehub_skipped:
        print(f"  Filtered out {sitehub_skipped} SiteHub employee interactions")

    # --- Enrich companies ---
    # Only new contacts are eligible for the (potentially slow) lookups.
    if verbose:
        print("Enriching companies...")
    stage_a, stage_b, stage_c, validated = enrich_missing_companies(
        contacts_list, ref=ref, use_web=use_web, use_cvr=use_cvr, verbose=verbose,
        new_contacts_only=new_contact_keys,
    )

    # --- Classify contacts ---
    if verbose:
        print("Classifying contacts...")
    status_counts = {"ICP": 0, "POI": 0, "Not ICP": 0, "Review": 0}
    auto_classified = 0
    preserved = 0
    for c in contacts_list:
        if c["Status"]:
            # Manually classified earlier — count it, but don't re-classify.
            preserved += 1
            status_counts[c["Status"]] = status_counts.get(c["Status"], 0) + 1
        else:
            status, tier, match_reason = classify_contact(
                c["Name"], c["Title"], c["Company"], ref
            )
            if status == "Review":
                # Fall back to patterns learned from past manual reviews,
                # but only when the learned suggestion is confident enough.
                learned_status, learned_reason, confidence = get_learned_suggestion(
                    c["Title"], c["Company"]
                )
                if learned_status and confidence >= 0.6:
                    status = learned_status
                    match_reason = f"learned:{learned_reason}"
            c["Status"] = status
            c["Company Tier"] = tier or ""
            c["Match Reason"] = match_reason or ""
            status_counts[status] = status_counts.get(status, 0) + 1
            auto_classified += 1

    if verbose and preserved:
        print(f"  Preserved {preserved} manual classifications, auto-classified {auto_classified}")

    # --- Count NEW contacts by status ---
    new_status_counts = {"ICP": 0, "POI": 0, "Not ICP": 0, "Review": 0}
    for c in contacts_list:
        key = (c.get("Name") or "").strip().lower()
        if key in new_contact_keys:
            status = c.get("Status", "Review")
            new_status_counts[status] = new_status_counts.get(status, 0) + 1

    # --- Auto-learn suggestions ---
    suggestions = _analyze_suggestions(contacts_list, ref, min_occurrences=3, verbose=verbose)

    return (
        contacts_list, new_contact_keys, status_counts, new_status_counts,
        stage_a, stage_b, stage_c, validated, suggestions,
    )


def _print_batch_summary(
    new_count: int,
    new_contact_keys: set[str],
    new_status_counts: dict,
    contacts_list: list[dict],
    verbose: bool = True,
):
    """Print the batch summary (shared by Excel and DB paths)."""
    if not verbose:
        return
    print("\n" + "=" * 50)
    print("BATCH SUMMARY")
    print("=" * 50)
    print(f"  Interactions:      {new_count}")
    print(f"  New contacts:      {len(new_contact_keys)}")
    print()
    print("  New contacts by status:")
    print(f"    ICP:      {new_status_counts.get('ICP', 0)}")
    print(f"    POI:      {new_status_counts.get('POI', 0)}")
    print(f"    Not ICP:  {new_status_counts.get('Not ICP', 0)}")
    print(f"    Review:   {new_status_counts.get('Review', 0)}")
    print()
    print(f"  Total contacts in tracker: {len(contacts_list)}")
    print("=" * 50)

    if new_status_counts.get("Review", 0) > 0:
        print(f"\n*** {new_status_counts['Review']} NEW CONTACTS NEED REVIEW ***")
        _show_review_queue(contacts_list, limit=10)
        print("\nRun: python icp_process.py --review" + (" --use-db" if _USE_DB else ""))


def update_tracker(
    tracker_path: str | Path | None = None,
    reference_path: str | Path | None = None,
    new_interactions: list[dict] | None = None,
    use_web: bool = False,
    use_cvr: bool = False,
    verbose: bool = True,
) -> dict:
    """Main update routine.

    If new_interactions is provided, appends them to All Interactions.
    Then enriches companies, re-classifies, updates Contacts sheet and Dashboard.

    Args:
        tracker_path: Tracker workbook path (defaults to TRACKER_PATH). Ignored in DB mode.
        reference_path: Reference workbook path (defaults to REFERENCE_PATH). Ignored in DB mode.
        new_interactions: Parsed interaction dicts to append before reprocessing.
        use_web: Enable web lookup during company enrichment.
        use_cvr: Enable CVR API lookup for uncertain company candidates (50/day free)
        verbose: Print progress information.

    Returns a summary dict.
    """
    # ===================== DB MODE =====================
    if _USE_DB:
        import icp_db
        session = _get_db_session()

        if verbose:
            print("Loading tracker: PostgreSQL database")

        # --- Load reference lists ---
        if verbose:
            print("Loading references: PostgreSQL database")
        ref = load_reference_lists()

        # --- Append new interactions to DB ---
        new_count = 0
        if new_interactions:
            new_count = icp_db.append_interactions(session, new_interactions)
            if verbose:
                print(f"  Appended {new_count} new interactions")

        # Re-read everything so processing sees old + new rows uniformly
        all_interactions = icp_db.read_interactions(session)
        existing_contacts = icp_db.read_contacts(session)

        # --- Shared processing ---
        (
            contacts_list, new_contact_keys, status_counts, new_status_counts,
            stage_a, stage_b, stage_c, validated, suggestions,
        ) = _process_contacts_from_interactions(
            all_interactions, existing_contacts, ref,
            use_web=use_web, use_cvr=use_cvr, verbose=verbose,
            new_interactions_count=new_count,
        )

        # --- Back-propagate enriched companies to interactions in DB ---
        name_to_company = {
            c["Name"].lower(): c["Company"]
            for c in contacts_list
            if c.get("Company")
        }
        interactions_enriched = icp_db.backfill_interaction_companies(session, name_to_company)
        if verbose and interactions_enriched:
            print(f"  Back-filled company on {interactions_enriched} interaction rows")

        # --- Write contacts to DB ---
        if verbose:
            print("Saving contacts to database...")
        icp_db.upsert_contacts(session, contacts_list)

        # --- Link interactions to contacts ---
        icp_db.link_interactions_to_contacts(session)

        # Dashboard and ICP sheet are Excel-only (skipped in DB mode)
        if verbose:
            print("  (Dashboard/ICP sheet updates skipped in DB mode — use web UI)")

        summary = {
            "total_contacts": len(contacts_list),
            "new_interactions": new_count,
            "new_contacts": len(new_contact_keys),
            "new_ICP": new_status_counts.get("ICP", 0),
            "new_POI": new_status_counts.get("POI", 0),
            "new_Not_ICP": new_status_counts.get("Not ICP", 0),
            "new_Review": new_status_counts.get("Review", 0),
            "companies_enriched_a": stage_a,
            "companies_enriched_b": stage_b,
            "companies_enriched_c": stage_c,
            "companies_validated": validated,
            "interactions_backfilled": interactions_enriched,
            "suggestions": suggestions,
            **status_counts,
        }

        # --- Zoho CRM sync for new ICP contacts ---
        # Best-effort: any failure is reported (if verbose) but never aborts
        # the update — the DB write above has already completed.
        if new_status_counts.get("ICP", 0) > 0:
            try:
                from zoho_client import is_configured
                if is_configured():
                    from zoho_sync import sync_all_unsynced
                    zoho_result = sync_all_unsynced(session)
                    if verbose and (zoho_result["synced"] > 0 or zoho_result["errors"] > 0):
                        print(f"  Zoho CRM: {zoho_result['synced']} synced, {zoho_result['errors']} errors")
            except Exception as e:
                if verbose:
                    print(f"  Zoho CRM sync skipped: {e}")

        _print_batch_summary(new_count, new_contact_keys, new_status_counts, contacts_list, verbose)
        return summary

    # ===================== EXCEL MODE (original) =====================
    tracker_path = Path(tracker_path or TRACKER_PATH)
    reference_path = Path(reference_path or REFERENCE_PATH)

    if verbose:
        print(f"Loading tracker: {tracker_path.name}")
    wb = openpyxl.load_workbook(tracker_path)

    # --- Load reference lists ---
    if verbose:
        print(f"Loading references: {reference_path.name}")
    ref = load_reference_lists(reference_path)

    # --- All Interactions sheet ---
    ws_int = wb["All Interactions"]
    int_headers = [cell.value for cell in ws_int[1]]

    # Append new interactions if provided
    new_count = 0
    if new_interactions:
        for row_data in new_interactions:
            new_row = []
            for h in int_headers:
                val = row_data.get(h, "")
                # Store ISO date strings as real dates so Excel sorts correctly
                if h == "Date" and isinstance(val, str) and val:
                    try:
                        val = datetime.strptime(val, "%Y-%m-%d").date()
                    except ValueError:
                        pass
                new_row.append(val if val else None)
            ws_int.append(new_row)
            new_count += 1
        if verbose:
            print(f"  Appended {new_count} new interactions")

    # Re-read all interactions
    all_interactions = _read_interactions_sheet(ws_int)

    # --- Load existing Contacts ---
    ws_con = wb["Contacts"]
    existing_contacts = _read_contacts_sheet(ws_con)

    # --- Shared processing ---
    (
        contacts_list, new_contact_keys, status_counts, new_status_counts,
        stage_a, stage_b, stage_c, validated, suggestions,
    ) = _process_contacts_from_interactions(
        all_interactions, existing_contacts, ref,
        use_web=use_web, use_cvr=use_cvr, verbose=verbose,
        new_interactions_count=new_count,
    )

    # Also back-propagate enriched companies to All Interactions (Excel)
    name_to_company = {
        c["Name"].lower(): c["Company"]
        for c in contacts_list
        if c.get("Company")
    }
    # Hoist column lookups out of the loop — list.index is O(n) per call and
    # the original did up to three lookups per row.
    name_col = int_headers.index("Name") + 1
    company_col = int_headers.index("Company") + 1
    interactions_enriched = 0
    for row_idx in range(2, ws_int.max_row + 1):
        name_cell = ws_int.cell(row=row_idx, column=name_col).value
        company_cell = ws_int.cell(row=row_idx, column=company_col).value
        if name_cell and not company_cell:
            enriched = name_to_company.get((name_cell or "").strip().lower())
            if enriched:
                ws_int.cell(row=row_idx, column=company_col, value=enriched)
                interactions_enriched += 1

    if verbose and interactions_enriched:
        print(f"  Back-filled company on {interactions_enriched} interaction rows")

    # --- Write Contacts sheet ---
    ws_con = wb["Contacts"]
    con_headers = [cell.value for cell in ws_con[1]]
    # Clear existing data rows in one ranged call. Deleting one row at a time
    # (the previous approach) makes openpyxl shift every following row on each
    # delete, which is O(n^2) on large sheets.
    if ws_con.max_row > 1:
        ws_con.delete_rows(2, ws_con.max_row - 1)

    # Sort: ICP first, then POI, then Review, then Not ICP
    status_order = {"ICP": 0, "POI": 1, "Review": 2, "Not ICP": 3}
    contacts_list.sort(
        key=lambda c: (status_order.get(c["Status"], 9), -(c["Total Interactions"] or 0))
    )

    for c in contacts_list:
        row_vals = []
        for h in con_headers:
            row_vals.append(c.get(h, ""))
        ws_con.append(row_vals)

    # --- Update Dashboard ---
    _update_dashboard(wb, contacts_list, all_interactions, status_counts, verbose)

    # --- Update ICP Contacts sheet ---
    _update_icp_sheet(wb, contacts_list, all_interactions, verbose)

    # --- Save ---
    if verbose:
        print(f"Saving {tracker_path.name}...")
    wb.save(tracker_path)
    wb.close()

    summary = {
        "total_contacts": len(contacts_list),
        "new_interactions": new_count,
        "new_contacts": len(new_contact_keys),
        "new_ICP": new_status_counts.get("ICP", 0),
        "new_POI": new_status_counts.get("POI", 0),
        "new_Not_ICP": new_status_counts.get("Not ICP", 0),
        "new_Review": new_status_counts.get("Review", 0),
        "companies_enriched_a": stage_a,
        "companies_enriched_b": stage_b,
        "companies_enriched_c": stage_c,
        "companies_validated": validated,
        "interactions_backfilled": interactions_enriched,
        "suggestions": suggestions,
        **status_counts,
    }

    _print_batch_summary(new_count, new_contact_keys, new_status_counts, contacts_list, verbose)
    return summary


# ---------------------------------------------------------------------------
# Dashboard updater
# ---------------------------------------------------------------------------
def _update_dashboard(wb, contacts, interactions, status_counts, verbose):
    """Rebuild the Dashboard sheet with comprehensive KPIs, trend indicators, and charts.

    Dashboard sections:
    1. KPI Summary with trend indicators (this month vs last month)
    2. Monthly Top Performer callout
    3. Executive Leaderboard (rank, ICP this month, trend %, posts, new ICPs)
    4. Posting Frequency table
    5. Executive Performance Bar Chart
    6. Monthly Trend Line Chart
    7. Status Distribution
    8. Recent Activity
    9. Top ICP Engagers
    10. Top Companies
    11. Top Posts
    """
    if "Dashboard" not in wb.sheetnames:
        return

    ws = wb["Dashboard"]

    # Clear existing content to avoid orphaned data
    # First unmerge any merged cells
    merged_ranges = list(ws.merged_cells.ranges)
    for merged_range in merged_ranges:
        ws.unmerge_cells(str(merged_range))

    # Remove existing charts to avoid duplicates
    ws._charts = []

    # Now clear all cells
    for row in ws.iter_rows():
        for cell in row:
            try:
                cell.value = None
                cell.font = Font()
                cell.fill = PatternFill()
                cell.alignment = Alignment()
            except AttributeError:
                pass  # Skip if cell can't be modified

    # --- Define styles ---
    title_font = Font(name='Calibri', size=18, bold=True, color='1F4E79')
    section_font = Font(name='Calibri', size=13, bold=True, color='FFFFFF')
    section_fill = PatternFill(start_color='2E75B6', end_color='2E75B6', fill_type='solid')
    header_font = Font(name='Calibri', size=11, bold=True, color='1F4E79')
    header_fill = PatternFill(start_color='D6DCE5', end_color='D6DCE5', fill_type='solid')
    label_font = Font(name='Calibri', size=11)
    value_font = Font(name='Calibri', size=11, bold=True)
    number_font = Font(name='Calibri', size=11)
    # Styles for winner callout
    winner_header_font = Font(name='Calibri', size=14, bold=True, color='FFFFFF')
    winner_fill = PatternFill(start_color='C65911', end_color='C65911', fill_type='solid')  # Orange
    winner_name_font = Font(name='Calibri', size=16, bold=True, color='1F4E79')
    winner_detail_font = Font(name='Calibri', size=11, color='333333')
    # Trend colors
    trend_up_font = Font(name='Calibri', size=11, bold=True, color='2E7D32')  # Green
    trend_down_font = Font(name='Calibri', size=11, bold=True, color='C62828')  # Red
    trend_neutral_font = Font(name='Calibri', size=11, bold=True, color='666666')  # Gray

    # --- Calculate all metrics ---
    total = len(contacts)
    icp = status_counts.get("ICP", 0)
    poi = status_counts.get("POI", 0)
    not_icp = status_counts.get("Not ICP", 0)
    review = status_counts.get("Review", 0)
    with_company = sum(1 for c in contacts if c.get("Company"))

    # Build lookup: name -> status and name -> contact
    contact_status = {c["Name"].lower(): c["Status"] for c in contacts}
    contact_lookup = {c["Name"].lower(): c for c in contacts}

    # Filter interactions to ICP contacts only
    icp_interactions = [
        inter for inter in interactions
        if contact_status.get((inter.get("Name") or "").strip().lower()) == "ICP"
    ]
    total_icp_int = len(icp_interactions)
    total_all_int = len(interactions)

    # ICP contacts list
    icp_contacts = [c for c in contacts if c.get("Status") == "ICP"]

    # Engagement rate (avg interactions per ICP contact)
    avg_engagement = total_icp_int / icp if icp > 0 else 0

    # --- Calendar month calculations ---
    today = datetime.now().date()
    current_month_start = today.replace(day=1)
    # Previous month
    if current_month_start.month == 1:
        prev_month_start = current_month_start.replace(year=current_month_start.year - 1, month=12)
    else:
        prev_month_start = current_month_start.replace(month=current_month_start.month - 1)
    prev_month_end = current_month_start - __import__('datetime').timedelta(days=1)

    # Helper to parse date
    def parse_date(date_val):
        if date_val is None:
            return None
        if isinstance(date_val, str):
            try:
                return datetime.strptime(date_val, "%Y-%m-%d").date()
            except ValueError:
                return None
        elif isinstance(date_val, datetime):
            return date_val.date()
        return date_val if hasattr(date_val, 'year') else None

    # --- Per-executive monthly metrics ---
    exec_stats: dict[str, dict] = {}
    monthly_icp_counts: dict[str, int] = {}  # "YYYY-MM" -> count for trend line

    for inter in icp_interactions:
        source = (inter.get("Source Profile") or "").strip()
        if not source:
            continue

        date_val = parse_date(inter.get("Date"))
        if not date_val:
            continue

        # Initialize exec stats
        if source not in exec_stats:
            exec_stats[source] = {
                "icp_this_month": 0,
                "icp_last_month": 0,
                "posts_this_month": set(),
                "posts_all": set(),
                "last_post_date": None,
                "new_icps_this_month": 0,
            }

        # Track monthly ICP counts for trend line
        month_key = date_val.strftime("%Y-%m")
        monthly_icp_counts[month_key] = monthly_icp_counts.get(month_key, 0) + 1

        # Current month
        if date_val >= current_month_start:
            exec_stats[source]["icp_this_month"] += 1
            post_hook = (inter.get("Post Hook") or "").strip()
            if post_hook:
                exec_stats[source]["posts_this_month"].add(post_hook)

        # Previous month
        if prev_month_start <= date_val <= prev_month_end:
            exec_stats[source]["icp_last_month"] += 1

        # Track all posts and last post date
        post_hook = (inter.get("Post Hook") or "").strip()
        if post_hook:
            exec_stats[source]["posts_all"].add(post_hook)
            if exec_stats[source]["last_post_date"] is None or date_val > exec_stats[source]["last_post_date"]:
                exec_stats[source]["last_post_date"] = date_val

    # Calculate new ICP contacts this month per executive
    for contact in icp_contacts:
        first_interaction = parse_date(contact.get("First Interaction"))
        if first_interaction and first_interaction >= current_month_start:
            # Find which executive had the first interaction with this contact
            contact_name = contact["Name"].lower()
            for inter in icp_interactions:
                inter_name = (inter.get("Name") or "").strip().lower()
                inter_date = parse_date(inter.get("Date"))
                if inter_name == contact_name and inter_date == first_interaction:
                    source = (inter.get("Source Profile") or "").strip()
                    if source and source in exec_stats:
                        exec_stats[source]["new_icps_this_month"] += 1
                    break

    # Calculate overall ICP trends
    icp_int_this_month = sum(s["icp_this_month"] for s in exec_stats.values())
    icp_int_last_month = sum(s["icp_last_month"] for s in exec_stats.values())

    # Recent activity (last 7 and 30 days)
    int_last_7 = 0
    int_last_30 = 0
    contacts_last_7 = set()
    contacts_last_30 = set()

    for inter in interactions:
        date_val = parse_date(inter.get("Date"))
        if date_val:
            days_ago = (today - date_val).days
            name = (inter.get("Name") or "").strip().lower()

            if days_ago <= 7:
                int_last_7 += 1
                if contact_status.get(name) == "ICP":
                    contacts_last_7.add(name)
            if days_ago <= 30:
                int_last_30 += 1
                if contact_status.get(name) == "ICP":
                    contacts_last_30.add(name)

    # Top posts by reaction count
    post_counts: dict[str, int] = {}
    for inter in interactions:
        hook = (inter.get("Post Hook") or "").strip()
        if hook:
            post_counts[hook] = post_counts.get(hook, 0) + 1
    top_posts = sorted(post_counts.items(), key=lambda x: -x[1])[:5]

    # Top companies by ICP engagement
    company_counts: dict[str, int] = {}
    for inter in icp_interactions:
        comp = (inter.get("Company") or "").strip()
        if comp:
            company_counts[comp] = company_counts.get(comp, 0) + 1
    top_companies = sorted(company_counts.items(), key=lambda x: -x[1])[:10]

    # Top ICP engagers
    sorted_icp_contacts = sorted(
        icp_contacts,
        key=lambda c: -(c.get("Total Interactions") or 0),
    )[:10]

    # --- Helper functions for styling ---
    def write_section_header(ws, row, text, col_span=4):
        """Render a blue section banner on *row* and return the next row.

        The fill is painted across ``col_span`` columns so the banner spans
        the section width; only column 1 carries the title text.
        """
        # Paint the banner fill across every spanned column first.
        for col_idx in range(1, col_span + 1):
            ws.cell(row=row, column=col_idx).fill = section_fill
        # Column 1 carries the title text and its styling.
        title = ws.cell(row=row, column=1, value=text)
        title.font = section_font
        title.alignment = Alignment(vertical='center')
        return row + 1

    def write_table_header(ws, row, headers):
        """Write one row of gray-filled column headings; return the next row."""
        for col_idx, text in enumerate(headers, start=1):
            heading = ws.cell(row=row, column=col_idx, value=text)
            heading.font = header_font
            heading.fill = header_fill
            heading.alignment = Alignment(horizontal='left')
        return row + 1

    def write_kpi_row(ws, row, label, value):
        """Write a single label/value KPI pair on *row*; return the next row."""
        # Label in column 1, value in column 2 (left-aligned).
        ws.cell(row=row, column=1, value=label).font = label_font
        val_cell = ws.cell(row=row, column=2, value=value)
        val_cell.font = value_font
        val_cell.alignment = Alignment(horizontal='left')
        return row + 1

    def format_trend(current, previous):
        """Format *current* with a percent-change suffix versus *previous*.

        Returns a ``(text, font)`` pair; the font colours the trend
        (up / down / neutral). A zero baseline yields no percentage: it is
        shown as "(new)" when there is activity, or as the bare number.
        """
        if previous == 0:
            if current > 0:
                return f"{current} (new)", trend_up_font
            return f"{current}", trend_neutral_font
        pct = (current - previous) / previous * 100
        if pct > 0:
            return f"{current} (+{pct:.0f}%)", trend_up_font
        if pct < 0:
            return f"{current} ({pct:.0f}%)", trend_down_font
        return f"{current} (0%)", trend_neutral_font

    # --- Build Dashboard ---
    row = 1

    # Title
    title_cell = ws.cell(row=row, column=1, value="SiteHub LinkedIn ICP Tracker")
    title_cell.font = title_font
    row += 1
    subtitle_cell = ws.cell(row=row, column=1, value=f"Dashboard - Updated {datetime.now().strftime('%Y-%m-%d %H:%M')}")
    subtitle_cell.font = Font(name='Calibri', size=10, italic=True, color='666666')
    row += 2

    # Section 1: KPI Summary with Trend Indicators
    row = write_section_header(ws, row, "KPI SUMMARY")
    row += 1
    row = write_kpi_row(ws, row, "Total Contacts", total)
    row = write_kpi_row(ws, row, "ICP Contacts", icp)
    row = write_kpi_row(ws, row, "POI Contacts", poi)
    row = write_kpi_row(ws, row, "Not ICP", not_icp)
    row = write_kpi_row(ws, row, "Pending Review", review)
    row += 1

    # ICP Interactions with trend
    icp_trend_str, icp_trend_font = format_trend(icp_int_this_month, icp_int_last_month)
    ws.cell(row=row, column=1, value="ICP Interactions (this month)").font = label_font
    ws.cell(row=row, column=2, value=icp_trend_str).font = icp_trend_font
    row += 1

    row = write_kpi_row(ws, row, "ICP Interactions (all time)", total_icp_int)
    row = write_kpi_row(ws, row, "Avg Interactions/ICP", f"{avg_engagement:.1f}")
    coverage_str = f"{with_company}/{total} ({with_company/total*100:.0f}%)" if total > 0 else "0/0"
    row = write_kpi_row(ws, row, "Company Coverage", coverage_str)
    row += 2

    # Section 2: Monthly Top Performer
    if exec_stats:
        # Find the winner (highest ICP interactions this month)
        sorted_execs = sorted(exec_stats.items(), key=lambda x: -x[1]["icp_this_month"])
        if sorted_execs and sorted_execs[0][1]["icp_this_month"] > 0:
            winner_name = sorted_execs[0][0]
            winner_stats = sorted_execs[0][1]
            month_name = current_month_start.strftime("%B %Y")

            # Winner header
            for col in range(1, 7):
                ws.cell(row=row, column=col).fill = winner_fill
            ws.cell(row=row, column=1, value=f"MONTHLY TOP PERFORMER - {month_name}").font = winner_header_font
            ws.merge_cells(start_row=row, start_column=1, end_row=row, end_column=6)
            row += 1

            # Winner name
            ws.cell(row=row, column=1, value=winner_name).font = winner_name_font
            row += 1

            # Winner details
            winner_detail = f"{winner_stats['icp_this_month']} ICP interactions from {len(winner_stats['posts_this_month'])} posts | {winner_stats['new_icps_this_month']} new ICP contacts"
            ws.cell(row=row, column=1, value=winner_detail).font = winner_detail_font
            row += 2

    # Section 3: Executive Leaderboard (Enhanced)
    if exec_stats:
        row = write_section_header(ws, row, "EXECUTIVE LEADERBOARD", 6)
        row = write_table_header(ws, row, ["Rank", "Executive", "ICP (Month)", "Trend", "Posts", "New ICPs"])

        sorted_execs = sorted(exec_stats.items(), key=lambda x: -x[1]["icp_this_month"])
        for rank, (exec_name, stats) in enumerate(sorted_execs, 1):
            icp_this = stats["icp_this_month"]
            icp_last = stats["icp_last_month"]
            posts = len(stats["posts_this_month"])
            new_icps = stats["new_icps_this_month"]

            # Calculate trend percentage
            if icp_last > 0:
                trend_pct = ((icp_this - icp_last) / icp_last) * 100
                if trend_pct > 0:
                    trend_str = f"+{trend_pct:.0f}%"
                    trend_font = trend_up_font
                elif trend_pct < 0:
                    trend_str = f"{trend_pct:.0f}%"
                    trend_font = trend_down_font
                else:
                    trend_str = "0%"
                    trend_font = trend_neutral_font
            else:
                trend_str = "new" if icp_this > 0 else "-"
                trend_font = trend_neutral_font

            ws.cell(row=row, column=1, value=rank).font = number_font
            ws.cell(row=row, column=2, value=exec_name).font = label_font
            ws.cell(row=row, column=3, value=icp_this).font = number_font
            ws.cell(row=row, column=4, value=trend_str).font = trend_font
            ws.cell(row=row, column=5, value=posts).font = number_font
            ws.cell(row=row, column=6, value=new_icps).font = number_font
            row += 1
        row += 2

    # Section 4: Posting Frequency
    if exec_stats:
        row = write_section_header(ws, row, "POSTING FREQUENCY", 3)
        row = write_table_header(ws, row, ["Executive", "Posts (30d)", "Days Since Last"])

        # Calculate posts in last 30 days
        thirty_days_ago = today - __import__('datetime').timedelta(days=30)

        for exec_name, stats in sorted(exec_stats.items(), key=lambda x: -len(x[1]["posts_this_month"])):
            # Posts this month approximates posts in last 30 days
            posts_30d = len(stats["posts_this_month"])

            # Days since last post
            last_post = stats["last_post_date"]
            if last_post:
                days_since = (today - last_post).days
                if days_since == 0:
                    days_str = "today"
                elif days_since == 1:
                    days_str = "1 day"
                else:
                    days_str = f"{days_since} days"
            else:
                days_str = "N/A"

            ws.cell(row=row, column=1, value=exec_name).font = label_font
            ws.cell(row=row, column=2, value=posts_30d).font = number_font
            ws.cell(row=row, column=3, value=days_str).font = number_font
            row += 1
        row += 2

    # Section 5: Executive Performance Bar Chart (data table)
    chart_data_start_row = row
    if exec_stats:
        row = write_section_header(ws, row, "EXECUTIVE PERFORMANCE (Chart Data)", 2)
        row = write_table_header(ws, row, ["Executive", "ICP Interactions"])
        chart_data_row_start = row
        sorted_execs_for_chart = sorted(exec_stats.items(), key=lambda x: -x[1]["icp_this_month"])
        for exec_name, stats in sorted_execs_for_chart:
            ws.cell(row=row, column=1, value=exec_name).font = label_font
            ws.cell(row=row, column=2, value=stats["icp_this_month"]).font = number_font
            row += 1
        chart_data_row_end = row - 1

        # Create bar chart
        if chart_data_row_end >= chart_data_row_start:
            bar_chart = BarChart()
            bar_chart.type = "bar"  # Horizontal bars
            bar_chart.style = 10
            bar_chart.title = "Executive ICP Interactions (This Month)"
            # For horizontal bar charts, don't set axis titles - they display awkwardly
            bar_chart.y_axis.title = None
            bar_chart.x_axis.title = None

            data = Reference(ws, min_col=2, min_row=chart_data_row_start - 1, max_row=chart_data_row_end)
            cats = Reference(ws, min_col=1, min_row=chart_data_row_start, max_row=chart_data_row_end)
            bar_chart.add_data(data, titles_from_data=True)
            bar_chart.set_categories(cats)
            bar_chart.shape = 4
            bar_chart.width = 15
            bar_chart.height = 8

            # Add data labels to show numbers on bars
            bar_chart.dataLabels = DataLabelList()
            bar_chart.dataLabels.showVal = True

            ws.add_chart(bar_chart, "H" + str(chart_data_start_row + 1))
        row += 2

    # Section 6: Monthly Trend Line Chart (data table)
    trend_chart_start_row = row
    if monthly_icp_counts:
        row = write_section_header(ws, row, "MONTHLY ICP TREND (Chart Data)", 2)
        row = write_table_header(ws, row, ["Month", "ICP Interactions"])
        trend_data_row_start = row

        # Sort months chronologically
        sorted_months = sorted(monthly_icp_counts.keys())
        for month_key in sorted_months:
            ws.cell(row=row, column=1, value=month_key).font = label_font
            ws.cell(row=row, column=2, value=monthly_icp_counts[month_key]).font = number_font
            row += 1
        trend_data_row_end = row - 1

        # Create line chart
        if trend_data_row_end >= trend_data_row_start:
            line_chart = LineChart()
            line_chart.style = 10
            line_chart.title = "ICP Interactions by Month"
            line_chart.y_axis.title = None
            line_chart.x_axis.title = None
            # Y-axis displays numbers automatically (standard practice for line charts)
            line_chart.y_axis.delete = False

            data = Reference(ws, min_col=2, min_row=trend_data_row_start - 1, max_row=trend_data_row_end)
            cats = Reference(ws, min_col=1, min_row=trend_data_row_start, max_row=trend_data_row_end)
            line_chart.add_data(data, titles_from_data=True)
            line_chart.set_categories(cats)
            line_chart.width = 15
            line_chart.height = 8

            ws.add_chart(line_chart, "H" + str(trend_chart_start_row + 1))
        row += 2

    # Section 7: Status Distribution
    row = write_section_header(ws, row, "STATUS DISTRIBUTION", 3)
    row = write_table_header(ws, row, ["Status", "Count", "Percentage"])
    for status, count in [("ICP", icp), ("POI", poi), ("Not ICP", not_icp), ("Review", review)]:
        pct = f"{count/total*100:.1f}%" if total > 0 else "0%"
        ws.cell(row=row, column=1, value=status).font = label_font
        ws.cell(row=row, column=2, value=count).font = number_font
        ws.cell(row=row, column=3, value=pct).font = number_font
        row += 1
    row += 2

    # Section 8: Recent Activity
    row = write_section_header(ws, row, "RECENT ACTIVITY")
    row += 1
    row = write_kpi_row(ws, row, "Interactions (last 7 days)", int_last_7)
    row = write_kpi_row(ws, row, "Interactions (last 30 days)", int_last_30)
    row = write_kpi_row(ws, row, "Active ICP contacts (7 days)", len(contacts_last_7))
    row = write_kpi_row(ws, row, "Active ICP contacts (30 days)", len(contacts_last_30))
    row += 2

    # Section 9: Top ICP Engagers
    row = write_section_header(ws, row, "TOP ICP ENGAGERS")
    row = write_table_header(ws, row, ["#", "Name", "Company", "Interactions"])
    for j, tc in enumerate(sorted_icp_contacts, 1):
        ws.cell(row=row, column=1, value=j).font = number_font
        ws.cell(row=row, column=2, value=tc["Name"]).font = label_font
        ws.cell(row=row, column=3, value=tc.get("Company", "")).font = label_font
        ws.cell(row=row, column=4, value=tc.get("Total Interactions", 0)).font = number_font
        row += 1
    row += 2

    # Section 10: Top Companies
    row = write_section_header(ws, row, "TOP COMPANIES (by ICP engagement)", 3)
    row = write_table_header(ws, row, ["#", "Company", "ICP Interactions"])
    for j, (comp, cnt) in enumerate(top_companies, 1):
        ws.cell(row=row, column=1, value=j).font = number_font
        ws.cell(row=row, column=2, value=comp).font = label_font
        ws.cell(row=row, column=3, value=cnt).font = number_font
        row += 1
    row += 2

    # Section 11: Top Posts
    row = write_section_header(ws, row, "TOP POSTS (by total reactions)", 3)
    row = write_table_header(ws, row, ["#", "Post Hook", "Reactions"])
    for j, (hook, cnt) in enumerate(top_posts, 1):
        # Truncate long post hooks for display
        display_hook = hook[:60] + "..." if len(hook) > 60 else hook
        ws.cell(row=row, column=1, value=j).font = number_font
        ws.cell(row=row, column=2, value=display_hook).font = label_font
        ws.cell(row=row, column=3, value=cnt).font = number_font
        row += 1
    row += 2

    # Adjust column widths
    ws.column_dimensions['A'].width = 30
    ws.column_dimensions['B'].width = 30
    ws.column_dimensions['C'].width = 20
    ws.column_dimensions['D'].width = 15
    ws.column_dimensions['E'].width = 12
    ws.column_dimensions['F'].width = 12
    ws.column_dimensions['G'].width = 12
    ws.column_dimensions['H'].width = 12

    if verbose:
        print(f"  Dashboard rebuilt: {icp} ICP contacts, {total_icp_int} ICP interactions, {int_last_7} in last 7 days")
        if exec_stats:
            winner = max(exec_stats.items(), key=lambda x: x[1]["icp_this_month"])
            print(f"  Monthly top performer: {winner[0]} with {winner[1]['icp_this_month']} ICP interactions")


# ---------------------------------------------------------------------------
# ICP Contacts Sheet
# ---------------------------------------------------------------------------
def _update_icp_sheet(wb, contacts, interactions, verbose):
    """Create/update the ICP Contacts sheet with aggregated engagement data.

    Shows only ICP-classified contacts, sorted by total interactions (most
    engaged first), with most-recent activity as the tiebreaker.

    Args:
        wb: openpyxl Workbook containing the tracker.
        contacts: list of contact dicts (from the Contacts sheet).
        interactions: list of interaction dicts (from All Interactions).
        verbose: when truthy, print a one-line summary.

    Columns:
    1. Name - Contact name
    2. Title - Current job title
    3. Company - Company name
    4. Company Tier - Tier 1/2/3 from reference list
    5. Total Interactions - Count of all interactions
    6. Last 30 Days - Interactions in past 30 days
    7. First Seen - Date of first interaction
    8. Last Active - Date of most recent interaction
    9. Notes - Preserved from Contacts sheet
    """
    sheet_name = "ICP Contacts"

    # Reuse the existing sheet (cleared) or create it next to the Dashboard.
    if sheet_name in wb.sheetnames:
        ws = wb[sheet_name]
        # Unmerge first: writing into merged cells raises in openpyxl.
        for merged_range in list(ws.merged_cells.ranges):
            ws.unmerge_cells(str(merged_range))
        for row in ws.iter_rows():
            for cell in row:
                try:
                    cell.value = None
                    cell.font = Font()
                    cell.fill = PatternFill()
                    cell.alignment = Alignment()
                except AttributeError:
                    # Some cell stubs (e.g. merged remnants) reject style writes.
                    pass
    else:
        ws = wb.create_sheet(title=sheet_name)
        # New sheets append at the end; shift left so it sits right after
        # the Dashboard tab when that sheet exists.
        if "Dashboard" in wb.sheetnames:
            dashboard_idx = wb.sheetnames.index("Dashboard")
            wb.move_sheet(ws, offset=-(len(wb.sheetnames) - dashboard_idx - 2))

    # --- Styles (matching Dashboard theme) ---
    header_font = Font(name='Calibri', size=11, bold=True, color='FFFFFF')
    header_fill = PatternFill(start_color='2E75B6', end_color='2E75B6', fill_type='solid')
    row_font = Font(name='Calibri', size=11)
    alt_fill = PatternFill(start_color='F2F2F2', end_color='F2F2F2', fill_type='solid')

    # --- Filter to ICP contacts only ---
    icp_contacts = [c for c in contacts if c.get("Status") == "ICP"]

    today = datetime.now().date()

    def _coerce_date(value):
        """Normalize a cell value to a date; None if missing/unparseable."""
        if isinstance(value, datetime):
            return value.date()
        if isinstance(value, str):
            try:
                return datetime.strptime(value, "%Y-%m-%d").date()
            except ValueError:
                return None
        return value  # already a date, or None

    # Group interactions by normalized contact name for O(1) lookup.
    contact_interactions: dict[str, list] = {}
    for inter in interactions:
        name = (inter.get("Name") or "").strip().lower()
        if name:
            contact_interactions.setdefault(name, []).append(inter)

    # --- Per-contact metrics ---
    # Lookup keys are stripped+lowercased to match the grouping above; the
    # previous un-stripped key meant a contact name with stray whitespace
    # never matched its own interactions (Last 30 Days stuck at 0).
    icp_data = []
    for c in icp_contacts:
        name = c.get("Name") or ""
        contact_ints = contact_interactions.get(name.strip().lower(), [])

        # Count interactions dated within the trailing 30-day window.
        last_30_days = 0
        for inter in contact_ints:
            date_val = _coerce_date(inter.get("Date"))
            if date_val and (today - date_val).days <= 30:
                last_30_days += 1

        icp_data.append({
            "Name": name,
            "Title": c.get("Title", ""),
            "Company": c.get("Company", ""),
            "Company Tier": c.get("Company Tier", ""),
            "Total Interactions": c.get("Total Interactions", 0),
            "Last 30 Days": last_30_days,
            "First Seen": c.get("First Interaction"),
            "Last Active": c.get("Latest Interaction"),
            "Notes": c.get("Notes", ""),
        })

    # --- Sort: Total Interactions (desc), then Last Active (desc) ---
    def sort_key(d):
        total_ints = d.get("Total Interactions") or 0
        # Unparseable/missing dates sort last (epoch floor).
        last_active = _coerce_date(d.get("Last Active")) or datetime.min.date()
        return (-total_ints, -last_active.toordinal())

    icp_data.sort(key=sort_key)

    # --- Write headers ---
    headers = ["Name", "Title", "Company", "Company Tier", "Total Interactions",
               "Last 30 Days", "First Seen", "Last Active", "Notes"]
    for col, header in enumerate(headers, 1):
        cell = ws.cell(row=1, column=col, value=header)
        cell.font = header_font
        cell.fill = header_fill
        cell.alignment = Alignment(horizontal='left', vertical='center')

    # --- Write data rows ---
    for row_idx, data in enumerate(icp_data, 2):
        for col, header in enumerate(headers, 1):
            cell = ws.cell(row=row_idx, column=col, value=data.get(header, ""))
            cell.font = row_font
            cell.alignment = Alignment(horizontal='left', vertical='center')
            # Alternating row colors for readability
            if row_idx % 2 == 0:
                cell.fill = alt_fill

    # --- Fixed column widths (openpyxl has no true auto-fit) ---
    column_widths = {
        'A': 25,  # Name
        'B': 35,  # Title
        'C': 25,  # Company
        'D': 12,  # Company Tier
        'E': 16,  # Total Interactions
        'F': 13,  # Last 30 Days
        'G': 12,  # First Seen
        'H': 12,  # Last Active
        'I': 30,  # Notes
    }
    for col_letter, width in column_widths.items():
        ws.column_dimensions[col_letter].width = width

    # Freeze header row
    ws.freeze_panes = 'A2'

    if verbose:
        print(f"  ICP Contacts sheet updated: {len(icp_data)} ICP contacts")


# ---------------------------------------------------------------------------
# Reporting & Insights
# ---------------------------------------------------------------------------
def generate_report(
    tracker_path: str | Path | None = None,
    verbose: bool = True,
    write_to_sheet: bool = False,
) -> dict:
    """Generate engagement insights and trend analysis.

    Outputs:
    - Engagement over time (weekly/monthly ICP interactions)
    - Top engaged ICPs (with recency and consistency metrics)
    - Company-level insights
    - Content performance (by Post Hook)

    Returns a summary dict. Optionally writes to a Reports sheet.
    """
    from collections import defaultdict
    from datetime import timedelta

    tracker_path = Path(tracker_path or TRACKER_PATH)

    if verbose:
        data_source = "PostgreSQL database" if _USE_DB else tracker_path.name
        print(f"=== ICP Engagement Report ===")
        print(f"Data source: {data_source}\n")

    # Load data
    if _USE_DB:
        import icp_db
        session = _get_db_session()
        contacts = icp_db.read_contacts(session)
        interactions = icp_db.read_interactions(session)
    else:
        wb = openpyxl.load_workbook(tracker_path, data_only=True)
        ws_con = wb["Contacts"]
        ws_int = wb["All Interactions"]
        contacts = _read_contacts_sheet(ws_con)
        interactions = _read_interactions_sheet(ws_int)

    # Build contact status lookup
    contact_status = {
        (c.get("Name") or "").strip().lower(): c.get("Status", "")
        for c in contacts
    }
    contact_company = {
        (c.get("Name") or "").strip().lower(): c.get("Company", "")
        for c in contacts
    }

    # Filter to ICP interactions only
    icp_interactions = [
        inter for inter in interactions
        if contact_status.get((inter.get("Name") or "").strip().lower()) == "ICP"
    ]

    if not icp_interactions:
        if verbose:
            print("No ICP interactions found.")
        if not _USE_DB:
            wb.close()
        return {"error": "No ICP interactions"}

    # -------------------------------------------------------------------------
    # 1. Engagement Over Time
    # -------------------------------------------------------------------------
    weekly_counts: dict[str, int] = defaultdict(int)
    monthly_counts: dict[str, int] = defaultdict(int)

    # Find date range
    dates = []
    for inter in icp_interactions:
        d = inter.get("Date")
        if d:
            if isinstance(d, datetime):
                d = d.date()
            dates.append(d)

    if dates:
        min_date = min(dates)
        max_date = max(dates)

        for inter in icp_interactions:
            d = inter.get("Date")
            if d:
                if isinstance(d, datetime):
                    d = d.date()
                # Week key: ISO week
                week_key = d.strftime("%Y-W%W")
                weekly_counts[week_key] += 1
                # Month key
                month_key = d.strftime("%Y-%m")
                monthly_counts[month_key] += 1

        # Calculate trend (compare last 4 weeks vs previous 4 weeks)
        sorted_weeks = sorted(weekly_counts.keys())
        if len(sorted_weeks) >= 8:
            recent_4 = sum(weekly_counts[w] for w in sorted_weeks[-4:])
            previous_4 = sum(weekly_counts[w] for w in sorted_weeks[-8:-4])
            if previous_4 > 0:
                trend_pct = ((recent_4 - previous_4) / previous_4) * 100
                trend_direction = "UP" if trend_pct > 5 else "DOWN" if trend_pct < -5 else "STABLE"
            else:
                trend_pct = 0
                trend_direction = "STABLE"
        else:
            trend_pct = 0
            trend_direction = "STABLE"

        if verbose:
            print("== Engagement Over Time ==")
            print(f"Date range: {min_date} to {max_date}")
            print(f"Total ICP interactions: {len(icp_interactions)}")
            print(f"\nMonthly breakdown:")
            for month in sorted(monthly_counts.keys())[-6:]:
                bar = "#" * (monthly_counts[month] // 5) + "." * max(0, 10 - monthly_counts[month] // 5)
                print(f"  {month}: {monthly_counts[month]:4d} {bar}")
            print(f"\nTrend (last 4 weeks vs previous 4): {trend_direction} {trend_pct:+.1f}%")
            print()

    # -------------------------------------------------------------------------
    # 2. Top Engaged ICPs (enhanced)
    # -------------------------------------------------------------------------
    icp_contacts = [c for c in contacts if c.get("Status") == "ICP"]

    # Calculate engagement consistency (interactions per week active)
    icp_engagement = []
    for c in icp_contacts:
        name = c.get("Name", "")
        name_lower = name.lower()
        total = c.get("Total Interactions", 0)
        first = c.get("First Interaction")
        latest = c.get("Latest Interaction")

        # Calculate weeks between first and latest interaction
        weeks_span = 1
        if first and latest:
            if isinstance(first, datetime):
                first = first.date()
            if isinstance(latest, datetime):
                latest = latest.date()
            days = (latest - first).days
            weeks_span = max(1, days // 7 + 1)

        consistency = total / weeks_span if weeks_span > 0 else total

        # Recency score (interactions in last 30 days)
        recent_count = 0
        today = datetime.now().date()
        for inter in icp_interactions:
            if (inter.get("Name") or "").strip().lower() == name_lower:
                d = inter.get("Date")
                if d:
                    if isinstance(d, datetime):
                        d = d.date()
                    if (today - d).days <= 30:
                        recent_count += 1

        icp_engagement.append({
            "name": name,
            "company": c.get("Company", ""),
            "total": total,
            "recent_30d": recent_count,
            "consistency": round(consistency, 2),
            "first": first,
            "latest": latest,
        })

    # Sort by total interactions
    icp_engagement.sort(key=lambda x: -x["total"])

    if verbose:
        print("== Top 15 Engaged ICPs ==")
        print(f"{'Rank':<5} {'Name':<25} {'Company':<20} {'Total':<6} {'30d':<5} {'Consist.':<8}")
        print("-" * 75)
        for i, e in enumerate(icp_engagement[:15], 1):
            print(f"{i:<5} {e['name'][:24]:<25} {(e['company'] or '')[:19]:<20} "
                  f"{e['total']:<6} {e['recent_30d']:<5} {e['consistency']:<8}")
        print()

    # -------------------------------------------------------------------------
    # 3. Company-Level Insights
    # -------------------------------------------------------------------------
    company_stats: dict[str, dict] = defaultdict(lambda: {
        "icp_count": 0, "interactions": 0, "names": set(), "recent_30d": 0
    })

    today = datetime.now().date()
    for inter in icp_interactions:
        name = (inter.get("Name") or "").strip().lower()
        comp = contact_company.get(name, "") or (inter.get("Company") or "").strip()
        if comp:
            company_stats[comp]["interactions"] += 1
            company_stats[comp]["names"].add(name)
            d = inter.get("Date")
            if d:
                if isinstance(d, datetime):
                    d = d.date()
                if (today - d).days <= 30:
                    company_stats[comp]["recent_30d"] += 1

    for comp in company_stats:
        company_stats[comp]["icp_count"] = len(company_stats[comp]["names"])

    # Sort by interactions
    top_companies = sorted(
        company_stats.items(),
        key=lambda x: -x[1]["interactions"]
    )[:20]

    if verbose:
        print("== Top 20 Companies by ICP Engagement ==")
        print(f"{'Rank':<5} {'Company':<30} {'ICPs':<6} {'Total':<7} {'30d':<5}")
        print("-" * 55)
        for i, (comp, stats) in enumerate(top_companies, 1):
            print(f"{i:<5} {comp[:29]:<30} {stats['icp_count']:<6} "
                  f"{stats['interactions']:<7} {stats['recent_30d']:<5}")
        print()

    # New companies (first interaction in last 30 days)
    new_companies = []
    for comp, stats in company_stats.items():
        # Find earliest interaction for this company
        earliest = None
        for inter in icp_interactions:
            name = (inter.get("Name") or "").strip().lower()
            inter_comp = contact_company.get(name, "") or (inter.get("Company") or "").strip()
            if inter_comp == comp:
                d = inter.get("Date")
                if d:
                    if isinstance(d, datetime):
                        d = d.date()
                    if earliest is None or d < earliest:
                        earliest = d
        if earliest and (today - earliest).days <= 30:
            new_companies.append((comp, stats["interactions"], earliest))

    if new_companies and verbose:
        new_companies.sort(key=lambda x: x[2], reverse=True)
        print("== New Companies (first ICP interaction in last 30 days) ==")
        for comp, cnt, first_date in new_companies[:10]:
            print(f"  {comp}: {cnt} interactions (since {first_date})")
        print()

    # -------------------------------------------------------------------------
    # 4. Content Performance (by Post Hook)
    # -------------------------------------------------------------------------
    post_stats: dict[str, dict] = defaultdict(lambda: {
        "total": 0, "comments": 0, "reactions": 0, "unique_icps": set(), "date": None
    })

    for inter in icp_interactions:
        hook = (inter.get("Post Hook") or "").strip()
        if hook:
            post_stats[hook]["total"] += 1
            inter_type = (inter.get("Type") or "").lower()
            if "comment" in inter_type:
                post_stats[hook]["comments"] += 1
            else:
                post_stats[hook]["reactions"] += 1
            name = (inter.get("Name") or "").strip().lower()
            post_stats[hook]["unique_icps"].add(name)
            d = inter.get("Date")
            if d and (post_stats[hook]["date"] is None or d > post_stats[hook]["date"]):
                post_stats[hook]["date"] = d

    # Convert sets to counts
    for hook in post_stats:
        post_stats[hook]["unique_icps"] = len(post_stats[hook]["unique_icps"])

    # Sort by total engagement
    top_posts = sorted(
        post_stats.items(),
        key=lambda x: -x[1]["total"]
    )[:15]

    if verbose:
        print("== Top 15 Posts by ICP Engagement ==")
        print(f"{'Rank':<5} {'Post Hook':<35} {'Total':<6} {'ICPs':<6} {'Comments':<9}")
        print("-" * 65)
        for i, (hook, stats) in enumerate(top_posts, 1):
            hook_display = hook[:34] if len(hook) <= 34 else hook[:31] + "..."
            print(f"{i:<5} {hook_display:<35} {stats['total']:<6} "
                  f"{stats['unique_icps']:<6} {stats['comments']:<9}")
        print()

    # -------------------------------------------------------------------------
    # 5. Optionally write to Reports sheet
    # -------------------------------------------------------------------------
    if write_to_sheet and not _USE_DB:
        if "Reports" not in wb.sheetnames:
            wb.create_sheet("Reports")
        ws_rep = wb["Reports"]

        # Clear existing content
        for row in ws_rep.iter_rows():
            for cell in row:
                cell.value = None

        row = 1

        # Header
        ws_rep.cell(row=row, column=1, value="ICP Engagement Report")
        ws_rep.cell(row=row, column=2, value=f"Generated: {datetime.now().strftime('%Y-%m-%d %H:%M')}")
        row += 2

        # Summary stats
        ws_rep.cell(row=row, column=1, value="Summary")
        row += 1
        ws_rep.cell(row=row, column=1, value="Total ICP Interactions")
        ws_rep.cell(row=row, column=2, value=len(icp_interactions))
        row += 1
        ws_rep.cell(row=row, column=1, value="Total ICP Contacts")
        ws_rep.cell(row=row, column=2, value=len(icp_contacts))
        row += 1
        if dates:
            ws_rep.cell(row=row, column=1, value="Date Range")
            ws_rep.cell(row=row, column=2, value=f"{min_date} to {max_date}")
        row += 2

        # Monthly breakdown
        ws_rep.cell(row=row, column=1, value="Monthly ICP Interactions")
        row += 1
        ws_rep.cell(row=row, column=1, value="Month")
        ws_rep.cell(row=row, column=2, value="Count")
        row += 1
        for month in sorted(monthly_counts.keys()):
            ws_rep.cell(row=row, column=1, value=month)
            ws_rep.cell(row=row, column=2, value=monthly_counts[month])
            row += 1
        row += 1

        # Top ICPs
        ws_rep.cell(row=row, column=1, value="Top Engaged ICPs")
        row += 1
        ws_rep.cell(row=row, column=1, value="Name")
        ws_rep.cell(row=row, column=2, value="Company")
        ws_rep.cell(row=row, column=3, value="Total")
        ws_rep.cell(row=row, column=4, value="Last 30d")
        ws_rep.cell(row=row, column=5, value="Consistency")
        row += 1
        for e in icp_engagement[:15]:
            ws_rep.cell(row=row, column=1, value=e["name"])
            ws_rep.cell(row=row, column=2, value=e["company"])
            ws_rep.cell(row=row, column=3, value=e["total"])
            ws_rep.cell(row=row, column=4, value=e["recent_30d"])
            ws_rep.cell(row=row, column=5, value=e["consistency"])
            row += 1
        row += 1

        # Top Companies
        ws_rep.cell(row=row, column=1, value="Top Companies by ICP Engagement")
        row += 1
        ws_rep.cell(row=row, column=1, value="Company")
        ws_rep.cell(row=row, column=2, value="ICP Contacts")
        ws_rep.cell(row=row, column=3, value="Total Interactions")
        ws_rep.cell(row=row, column=4, value="Last 30d")
        row += 1
        for comp, stats in top_companies:
            ws_rep.cell(row=row, column=1, value=comp)
            ws_rep.cell(row=row, column=2, value=stats["icp_count"])
            ws_rep.cell(row=row, column=3, value=stats["interactions"])
            ws_rep.cell(row=row, column=4, value=stats["recent_30d"])
            row += 1
        row += 1

        # Top Posts
        ws_rep.cell(row=row, column=1, value="Top Posts by ICP Engagement")
        row += 1
        ws_rep.cell(row=row, column=1, value="Post Hook")
        ws_rep.cell(row=row, column=2, value="Total")
        ws_rep.cell(row=row, column=3, value="Unique ICPs")
        ws_rep.cell(row=row, column=4, value="Comments")
        row += 1
        for hook, stats in top_posts:
            ws_rep.cell(row=row, column=1, value=hook)
            ws_rep.cell(row=row, column=2, value=stats["total"])
            ws_rep.cell(row=row, column=3, value=stats["unique_icps"])
            ws_rep.cell(row=row, column=4, value=stats["comments"])
            row += 1

        wb.save(tracker_path)
        if verbose:
            print(f"Report written to 'Reports' sheet in {tracker_path.name}")

    if not _USE_DB:
        wb.close()

    return {
        "total_icp_interactions": len(icp_interactions),
        "total_icp_contacts": len(icp_contacts),
        "trend_direction": trend_direction if dates else None,
        "trend_pct": trend_pct if dates else None,
        "top_companies": [(c, s["interactions"]) for c, s in top_companies[:10]],
        "top_posts": [(h, s["total"]) for h, s in top_posts[:10]],
    }


# ---------------------------------------------------------------------------
# Pattern Analysis — learn from manual classifications
# ---------------------------------------------------------------------------
def analyze_classifications(
    tracker_path: str | Path | None = None,
    reference_path: str | Path | None = None,
    verbose: bool = True,
) -> dict:
    """Analyze manually classified contacts to suggest reference list additions.

    Reads the Contacts sheet, groups contacts by Status, and for ICP contacts:

    - Lists unique companies (sorted by frequency) and flags those missing
      from the reference Companies sheet (2+ occurrences become suggestions).
    - Extracts title keywords and flags frequent ones (3+ occurrences) that
      no existing title pattern covers.
    - Prints a POI company breakdown as potential upgrade candidates.

    Args:
        tracker_path: Tracker workbook path (defaults to TRACKER_PATH).
        reference_path: Reference-list workbook path (defaults to REFERENCE_PATH).
        verbose: When True, print the full analysis to stdout.

    Returns:
        Summary dict with ``icp_count``, ``unique_companies``,
        ``suggested_companies`` and ``suggested_patterns`` (both capped at 15).
        When there are no ICP contacts, those keys are returned empty, plus
        the legacy ``companies``/``title_patterns`` keys kept for backward
        compatibility with earlier callers.
    """
    from collections import Counter

    tracker_path = Path(tracker_path or TRACKER_PATH)
    reference_path = Path(reference_path or REFERENCE_PATH)

    if verbose:
        print(f"=== Analyzing Classifications in {tracker_path.name} ===\n")

    # Load tracker contacts (read-only: this function never writes the tracker).
    wb = openpyxl.load_workbook(tracker_path, read_only=True, data_only=True)
    ws = wb["Contacts"]
    contacts = _read_contacts_sheet(ws)
    wb.close()

    # Existing reference data, used to mark suggestions as NEW vs already known.
    # NOTE(review): assumes load_reference_lists() lower-cases company keys —
    # comparisons below use comp.lower(). Verify against its implementation.
    ref = load_reference_lists(reference_path)
    existing_companies = set(ref["companies"].keys())
    existing_title_patterns = {jt["pattern"] for jt in ref["job_titles"]}

    # Group contacts by status; contacts with an unknown status are dropped.
    by_status: dict[str, list[dict]] = {"ICP": [], "POI": [], "Not ICP": [], "Review": []}
    for c in contacts:
        status = c.get("Status", "Review")
        if status in by_status:
            by_status[status].append(c)

    if verbose:
        print("Contact Distribution:")
        for status, lst in by_status.items():
            print(f"  {status}: {len(lst)}")
        print()

    # Analyze ICP contacts
    icp_contacts = by_status["ICP"]
    if not icp_contacts:
        if verbose:
            print("No ICP contacts to analyze.")
        # Return the full summary shape (all keys empty) so callers can read
        # the same keys regardless of outcome; the legacy "companies" /
        # "title_patterns" keys are kept for backward compatibility.
        return {
            "companies": [], "title_patterns": [],
            "icp_count": 0, "unique_companies": 0,
            "suggested_companies": [], "suggested_patterns": [],
        }

    # 1. Company frequency analysis (blank company names are skipped).
    company_freq = Counter(
        comp for c in icp_contacts
        if (comp := (c.get("Company") or "").strip())
    )

    # Sorted by frequency (ties keep first-seen order). Suggest companies seen
    # 2+ times that are not already in the reference list.
    companies_sorted = company_freq.most_common()
    suggested_companies = [
        (comp, cnt) for comp, cnt in companies_sorted
        if cnt >= 2 and comp.lower() not in existing_companies
    ]

    if verbose:
        print("=== ICP Company Analysis ===")
        print(f"Unique companies among ICP contacts: {len(company_freq)}")
        print("\nTop 20 ICP companies (all):")
        for comp, cnt in companies_sorted[:20]:
            in_ref = "[OK]" if comp.lower() in existing_companies else "NEW"
            print(f"  {cnt:3d}x  {comp}  [{in_ref}]")

        if suggested_companies:
            print("\n--- Suggested Additions to Companies sheet (2+ occurrences, not in reference) ---")
            for comp, cnt in suggested_companies[:15]:
                print(f"  {cnt:3d}x  {comp}")
        print()

    # 2. Title keyword analysis — tokenize ICP titles and count meaningful
    # words (>= 3 chars, not a filler word; Danish and English fillers listed).
    stopwords = {
        "and", "or", "the", "a", "an", "of", "in", "at", "to", "for", "with",
        "og", "i", "hos", "til", "med", "på", "ved", "af", "|", "-", "–", "—",
        "&", "/", ".", ",", "(", ")", "[", "]",
    }
    word_freq: Counter[str] = Counter()
    for c in icp_contacts:
        title = (c.get("Title") or "").lower()
        word_freq.update(
            w for w in re.findall(r"[a-zæøå]+", title)
            if len(w) >= 3 and w not in stopwords
        )

    words_sorted = word_freq.most_common()

    def _covered(word: str) -> bool:
        # A word is "covered" when it overlaps any existing title pattern in
        # either direction (substring containment either way).
        return any(word in pat or pat in word for pat in existing_title_patterns)

    # Frequent words (3+) not already covered become pattern suggestions.
    suggested_patterns = [
        (word, cnt) for word, cnt in words_sorted
        if cnt >= 3 and not _covered(word)
    ]

    if verbose:
        print("=== ICP Title Keyword Analysis ===")
        print("Top 30 title keywords among ICP contacts:")
        for word, cnt in words_sorted[:30]:
            status = "[OK]" if _covered(word) else "NEW"
            print(f"  {cnt:3d}x  {word}  [{status}]")

        if suggested_patterns:
            print("\n--- Suggested Title Patterns (3+ occurrences, not covered) ---")
            for word, cnt in suggested_patterns[:15]:
                print(f"  {cnt:3d}x  {word}")
        print()

    # 3. POI analysis — companies of "close but not ICP" contacts; these are
    # the most likely candidates for future reference-list upgrades.
    poi_contacts = by_status["POI"]
    if poi_contacts and verbose:
        print("=== POI Analysis (potential upgrades) ===")
        poi_company_freq = Counter(
            comp for c in poi_contacts
            if (comp := (c.get("Company") or "").strip())
        )
        print("Top 10 POI companies:")
        for comp, cnt in poi_company_freq.most_common(10):
            print(f"  {cnt:3d}x  {comp}")
        print()

    summary = {
        "icp_count": len(icp_contacts),
        "unique_companies": len(company_freq),
        "suggested_companies": suggested_companies[:15],
        "suggested_patterns": suggested_patterns[:15],
    }

    if verbose:
        print("=== Summary ===")
        print(f"  ICP contacts analyzed: {len(icp_contacts)}")
        print(f"  Suggested company additions: {len(suggested_companies)}")
        print(f"  Suggested title patterns: {len(suggested_patterns)}")

    return summary


# ---------------------------------------------------------------------------
# Main entry point for new data
# ---------------------------------------------------------------------------
def process_new_data(
    raw_text: str,
    tracker_path: str | Path | None = None,
    reference_path: str | Path | None = None,
    use_web: bool = False,
    use_cvr: bool = False,
    minimal_mode: bool = False,
) -> dict:
    """Run the complete ingestion pipeline on raw LinkedIn data.

    Parses *raw_text* into interaction dicts, validates them, then delegates
    to ``update_tracker`` for enrichment, classification, and persistence.

    Args:
        raw_text: Raw pasted/imported LinkedIn data.
        tracker_path: Tracker workbook path (None -> default TRACKER_PATH).
        reference_path: Reference workbook path (None -> default REFERENCE_PATH).
        use_web: Enable web search during company enrichment.
        use_cvr: Enable CVR API lookups during enrichment.
        minimal_mode: Parse the simplified tab-separated input format.

    Returns:
        The summary dict produced by ``update_tracker``.
    """
    print("Parsing raw data...")
    parsed = parse_raw_data(raw_text, minimal_mode=minimal_mode)
    print(f"  Parsed {len(parsed)} interactions")
    validated = validate_parsed_interactions(parsed)

    return update_tracker(
        tracker_path=tracker_path,
        reference_path=reference_path,
        new_interactions=validated,
        use_web=use_web,
        use_cvr=use_cvr,
    )


# ---------------------------------------------------------------------------
# Import manual classifications from external file
# ---------------------------------------------------------------------------
def import_manual_classifications(
    source_path: str | Path,
    tracker_path: str | Path | None = None,
    verbose: bool = True,
) -> dict:
    """Import manual Status classifications from an external Excel file.

    Reads the source file's TBL_iLog sheet (or first sheet with Status column),
    extracts name->status mappings, and updates the tracker's Contacts sheet
    (or the database when DB mode is active). When a name appears multiple
    times in the source, the last row wins.

    Args:
        source_path: Excel file containing at least a Name and a Status column.
        tracker_path: Tracker workbook path (None -> default TRACKER_PATH).
            Ignored in DB mode.
        verbose: Print progress and a per-status breakdown.

    Returns:
        Dict with ``imported`` (mappings read from the source file) and
        ``updated`` (contacts whose status actually changed).

    Raises:
        ValueError: If the Name or Status column cannot be located.
    """
    source_path = Path(source_path)
    tracker_path = Path(tracker_path or TRACKER_PATH)

    if verbose:
        print(f"=== Importing Manual Classifications ===")
        print(f"  Source: {source_path.name}")
        print(f"  Target: {tracker_path.name}\n")

    # 1. Read source file — extract name -> status mappings
    wb_src = openpyxl.load_workbook(source_path, read_only=True, data_only=True)

    # Try TBL_iLog first, then first sheet
    if "TBL_iLog" in wb_src.sheetnames:
        ws_src = wb_src["TBL_iLog"]
    else:
        ws_src = wb_src.active

    headers = [cell.value for cell in ws_src[1]]

    # Find Status and Name columns by header text. NOTE(review): the "status"
    # match is a substring test and takes the LAST matching column, so e.g. a
    # "Sync Status" column to the right would win over plain "Status" — confirm
    # source files only carry one status-like header.
    status_idx = None
    name_idx = None
    for i, h in enumerate(headers):
        if h and "status" in str(h).lower():
            status_idx = i
        if h and str(h).lower() in ("name", "name_clean"):
            name_idx = i

    if status_idx is None or name_idx is None:
        wb_src.close()
        raise ValueError(f"Could not find Name and Status columns in {source_path.name}")

    # Build name -> status map (use latest status per name); names are keyed
    # case-insensitively to match the tracker lookup below.
    manual_status: dict[str, str] = {}
    for row in ws_src.iter_rows(min_row=2, values_only=True):
        name = row[name_idx] if name_idx < len(row) else None
        status = row[status_idx] if status_idx < len(row) else None
        if name and status:
            name_key = str(name).strip().lower()
            manual_status[name_key] = str(status).strip()

    wb_src.close()

    if verbose:
        from collections import Counter
        counts = Counter(manual_status.values())
        print(f"Loaded {len(manual_status)} manual classifications:")
        for s, c in sorted(counts.items(), key=lambda x: -x[1]):
            print(f"  {s}: {c}")
        print()

    # 2. Update tracker Contacts — either in the database or in the workbook.
    if _USE_DB:
        from icp_db import update_contact_status
        session = _get_db_session()
        updated = 0
        for name_key, new_status in manual_status.items():
            if update_contact_status(session, name_key, new_status):
                updated += 1
    else:
        wb = openpyxl.load_workbook(tracker_path)
        ws = wb["Contacts"]
        headers = [cell.value for cell in ws[1]]

        # Raises ValueError if the tracker is missing either column —
        # intentional, the tracker schema is expected to be fixed.
        name_col = headers.index("Name") + 1
        status_col = headers.index("Status") + 1

        updated = 0
        for row_idx in range(2, ws.max_row + 1):
            name_cell = ws.cell(row=row_idx, column=name_col).value
            if not name_cell:
                continue
            name_key = str(name_cell).strip().lower()
            if name_key in manual_status:
                new_status = manual_status[name_key]
                old_status = ws.cell(row=row_idx, column=status_col).value
                # Only count rows whose status actually changes.
                if old_status != new_status:
                    ws.cell(row=row_idx, column=status_col, value=new_status)
                    updated += 1

        # Saved even when nothing changed — harmless, keeps the flow simple.
        wb.save(tracker_path)
        wb.close()

    if verbose:
        print(f"Updated {updated} contact statuses in tracker")

    return {"imported": len(manual_status), "updated": updated}


# ---------------------------------------------------------------------------
# Phase 5: Review Queue + Interactive Mode
# ---------------------------------------------------------------------------
def _show_review_queue(contacts: list[dict], limit: int = 20) -> list[dict]:
    """Show contacts that need review with reasons.

    Returns the list of contacts needing review.
    """
    review_contacts = [
        c for c in contacts
        if c.get("Status") == "Review"
    ]

    if not review_contacts:
        print("\nNo contacts need review. All classified.")
        return []

    print(f"\n=== Review Queue ({len(review_contacts)} contacts) ===")
    print(f"{'#':<4} {'Name':<25} {'Company':<20} {'Title':<35}")
    print("-" * 86)

    for i, c in enumerate(review_contacts[:limit], 1):
        name = (c.get("Name") or "")[:24]
        company = (c.get("Company") or "")[:19]
        title = (c.get("Title") or "")[:34]
        print(f"{i:<4} {name:<25} {company:<20} {title:<35}")

    if len(review_contacts) > limit:
        print(f"  ... and {len(review_contacts) - limit} more")

    return review_contacts


def interactive_review_mode(
    tracker_path: str | Path | None = None,
    reference_path: str | Path | None = None,
    verbose: bool = True,
) -> dict:
    """Interactive mode for quickly classifying Review contacts.

    Displays contacts one by one and allows quick classification:
        1 = ICP
        2 = POI
        3 = Not ICP
        c = set/correct the company first, then classify
        s = skip
        q = quit

    Saves after each decision (Excel mode saves the workbook; DB mode writes
    through ``icp_db.update_contact_status``). Every decision is also fed to
    ``learn_from_classification`` so future suggestions improve.

    Args:
        tracker_path: Tracker workbook path (None -> default TRACKER_PATH).
            Ignored in DB mode.
        reference_path: Accepted for interface symmetry; not read here.
        verbose: Print the header and command help.

    Returns:
        Dict with ``reviewed`` (classified count) and ``skipped`` counts.
    """
    tracker_path = Path(tracker_path or TRACKER_PATH)

    if verbose:
        print(f"=== Interactive Review Mode ===")
        data_source = "PostgreSQL database" if _USE_DB else tracker_path.name
        print(f"Tracker: {data_source}")
        print()
        print("Commands:")
        print("  1 = ICP")
        print("  2 = POI")
        print("  3 = Not ICP")
        print("  c = set company (then classify)")
        print("  s = skip")
        print("  q = quit")
        print()

    # ===================== DB MODE =====================
    # Pull Review-status contacts from the database; no row_idx is needed
    # because updates go through icp_db by contact name.
    if _USE_DB:
        import icp_db
        session = _get_db_session()
        review_contacts = icp_db.get_review_contacts(session)
        review_rows = [
            {
                "name": c["Name"],
                "title": c["Title"],
                "company": c["Company"],
            }
            for c in review_contacts
        ]
    else:
        # ===================== EXCEL MODE =====================
        # Open the workbook writable; each decision below saves it in place.
        wb = openpyxl.load_workbook(tracker_path)
        ws = wb["Contacts"]
        headers = [cell.value for cell in ws[1]]

        # Name and Status are required; Title/Company are optional columns.
        # NOTE(review): company_col stays None when the header is missing,
        # which would make the 'c' command crash at the ws.cell() call —
        # confirm trackers always carry a Company column.
        name_col = headers.index("Name") + 1
        title_col = headers.index("Title") + 1 if "Title" in headers else None
        company_col = headers.index("Company") + 1 if "Company" in headers else None
        status_col = headers.index("Status") + 1

        # Collect every row still marked "Review", remembering its sheet row
        # index so edits can be written back directly.
        review_rows = []
        for row_idx in range(2, ws.max_row + 1):
            status = ws.cell(row=row_idx, column=status_col).value
            if status == "Review":
                name = ws.cell(row=row_idx, column=name_col).value or ""
                title = ws.cell(row=row_idx, column=title_col).value if title_col else ""
                company = ws.cell(row=row_idx, column=company_col).value if company_col else ""
                review_rows.append({
                    "row_idx": row_idx,
                    "name": name,
                    "title": title or "",
                    "company": company or "",
                })

    if not review_rows:
        print("No contacts need review. All classified.")
        if not _USE_DB:
            wb.close()
        return {"reviewed": 0, "skipped": 0}

    print(f"Found {len(review_rows)} contacts needing review.\n")

    classified = 0
    skipped = 0

    for i, contact in enumerate(review_rows, 1):
        print(f"[{i}/{len(review_rows)}] {contact['name']}")
        print(f"  Title:   {contact['title']}")
        print(f"  Company: {contact['company'] or '(unknown)'}")

        # Show learned suggestion if available (only at >= 50% confidence).
        suggested, reason, confidence = get_learned_suggestion(contact['title'], contact['company'])
        if suggested and confidence >= 0.5:
            conf_pct = int(confidence * 100)
            print(f"  ** Suggested: {suggested} ({conf_pct}% confidence) - {reason}")

        # Prompt loop for this contact; breaks only on a classification or
        # an explicit skip. Ctrl-C / EOF is treated as 'q' (quit).
        while True:
            try:
                choice = input("  Classify [1/2/3/c/s/q]: ").strip().lower()
            except (EOFError, KeyboardInterrupt):
                choice = "q"

            if choice in ("1", "2", "3"):
                status_map = {"1": "ICP", "2": "POI", "3": "Not ICP"}
                new_status = status_map[choice]

                # Persist immediately so quitting mid-session loses nothing.
                if _USE_DB:
                    icp_db.update_contact_status(session, contact["name"], new_status)
                else:
                    ws.cell(row=contact["row_idx"], column=status_col, value=new_status)
                    wb.save(tracker_path)

                print(f"  -> {new_status}")
                learn_from_classification(contact["name"], contact["title"], contact["company"], new_status)
                classified += 1
                break
            elif choice == "c":
                # Correct the company first; classification still follows
                # unless the SiteHub branch below fully resolves the contact.
                old_company = contact["company"]
                try:
                    new_company = input("  Enter company: ").strip()
                except (EOFError, KeyboardInterrupt):
                    print("  -> company entry cancelled")
                    continue
                if new_company:
                    # A replaced company means the old value was a bad
                    # extraction — blocklist it so it isn't reused.
                    if old_company and old_company.lower() != new_company.lower():
                        _add_to_company_blocklist(old_company)
                        print(f"  -> Added '{old_company}' to blocklist")

                    if _USE_DB:
                        icp_db.update_contact_status(session, contact["name"], "Review", company=new_company)
                    else:
                        ws.cell(row=contact["row_idx"], column=company_col, value=new_company)
                        wb.save(tracker_path)

                    contact["company"] = new_company
                    print(f"  -> Company set to: {new_company}")

                    # Check if this is a SiteHub employee
                    if _SITEHUB_RE.search(new_company):
                        print(f"  ** SiteHub detected! **")
                        try:
                            confirm = input("  Mark as SiteHub employee and filter out? [y/n]: ").strip().lower()
                        except (EOFError, KeyboardInterrupt):
                            confirm = "n"

                        if confirm == "y":
                            if _USE_DB:
                                icp_db.update_contact_status(
                                    session, contact["name"], "Not ICP",
                                    notes="SiteHub employee",
                                )
                            else:
                                ws.cell(row=contact["row_idx"], column=status_col, value="Not ICP")
                                if "Notes" in headers:
                                    notes_col = headers.index("Notes") + 1
                                    ws.cell(row=contact["row_idx"], column=notes_col, value="SiteHub employee")
                                wb.save(tracker_path)
                            print("  -> Marked as SiteHub employee (Not ICP)")

                            # Optionally make the filter permanent so future
                            # imports skip this person automatically.
                            try:
                                add_to_list = input("  Add to permanent SiteHub employees list? [y/n]: ").strip().lower()
                            except (EOFError, KeyboardInterrupt):
                                add_to_list = "n"

                            if add_to_list == "y":
                                if _add_sitehub_employee(contact["name"]):
                                    print(f"  -> Added '{contact['name']}' to permanent employee filter")
                                    storage = "database" if _USE_DB else _SITEHUB_EMPLOYEES_FILE.name
                                    print(f"     (Saved to {storage})")
                                else:
                                    print(f"  -> '{contact['name']}' already in employee list")

                            classified += 1
                            break  # Move to next contact
                else:
                    print("  -> no company entered, still (unknown)")
                # Don't break - continue loop to get classification
            elif choice == "s":
                print("  -> skipped")
                skipped += 1
                break
            elif choice == "q":
                # Quit immediately; Excel saves already happened per decision.
                print("\nQuitting review mode...")
                if not _USE_DB:
                    wb.close()
                return {"reviewed": classified, "skipped": skipped}
            else:
                print("  Invalid choice. Use 1, 2, 3, c, s, or q.")

        print()

    if not _USE_DB:
        wb.close()

    print(f"\n=== Review Complete ===")
    print(f"  Classified: {classified}")
    print(f"  Skipped: {skipped}")

    return {"reviewed": classified, "skipped": skipped}


# ---------------------------------------------------------------------------
# CLI
# ---------------------------------------------------------------------------
def main():
    import argparse

    parser = argparse.ArgumentParser(
        description="SiteHub LinkedIn ICP Tracker — processing script",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  python icp_process.py --backfill           # Re-enrich & update Dashboard
  python icp_process.py --analyze            # Suggest reference list additions
  python icp_process.py --report             # Generate engagement insights
  python icp_process.py --report --save      # Write report to Excel sheet
  python icp_process.py --input data.csv     # Process new data from file
  python icp_process.py --quick "John Doe" "CEO at Acme Inc"
  python icp_process.py --minimal            # Simplified paste mode
  python icp_process.py --likes              # LinkedIn likes format (Name/Title alternating)
  python icp_process.py --review             # Interactive review mode
        """
    )
    parser.add_argument(
        "--backfill",
        action="store_true",
        help="Run enrichment + re-classification on existing data (no new data)",
    )
    parser.add_argument(
        "--analyze",
        action="store_true",
        help="Analyze manual classifications to suggest reference list additions",
    )
    parser.add_argument(
        "--report",
        action="store_true",
        help="Generate engagement insights report (trends, top ICPs, companies, content)",
    )
    parser.add_argument(
        "--save",
        action="store_true",
        help="When used with --report, write report to 'Reports' sheet in tracker",
    )
    parser.add_argument(
        "--quick",
        nargs="+",
        metavar=("NAME", "TITLE"),
        help='Quick-add a single entry: --quick "Name" "Title" ["PostHook"]',
    )
    parser.add_argument(
        "--minimal",
        action="store_true",
        help="Use minimal input format: Name<tab>Title[<tab>Company][<tab>PostHook]",
    )
    parser.add_argument(
        "--import-status",
        type=str,
        dest="import_status",
        metavar="FILE",
        help="Import manual Status classifications from an external Excel file",
    )
    parser.add_argument(
        "--input", "-i",
        type=str,
        help="Path to a file containing new LinkedIn data (tab or comma separated)",
    )
    parser.add_argument(
        "--tracker",
        type=str,
        default=str(TRACKER_PATH),
        help="Path to SiteHub_LinkedIn_ICP_Tracker.xlsx",
    )
    parser.add_argument(
        "--reference",
        type=str,
        default=str(REFERENCE_PATH),
        help="Path to ICP_Reference_Lists.xlsx",
    )
    parser.add_argument(
        "--no-web",
        action="store_true",
        dest="no_web",
        help="Disable web search for company enrichment (enabled by default)",
    )
    parser.add_argument(
        "--no-cvr",
        action="store_true",
        dest="no_cvr",
        help="Disable CVR API lookup (enabled by default, 50 free lookups/day)",
    )
    parser.add_argument(
        "--likes",
        action="store_true",
        help="Parse LinkedIn likes format (alternating Name/Title lines)",
    )
    parser.add_argument(
        "--post-hook",
        type=str,
        dest="post_hook",
        default="",
        help="Post hook to associate with imported entries (used with --likes)",
    )
    parser.add_argument(
        "--review",
        action="store_true",
        help="Interactive review mode: classify contacts with Status='Review'",
    )
    parser.add_argument(
        "--no-auto-review",
        action="store_true",
        dest="no_auto_review",
        help="Skip automatic review mode after processing (for non-interactive use)",
    )
    parser.add_argument(
        "--set-source",
        type=str,
        dest="set_source",
        metavar="NAME",
        help="Set Source Profile for all interactions with empty Source Profile",
    )
    parser.add_argument(
        "--use-db",
        action="store_true",
        dest="use_db",
        help="Use PostgreSQL database instead of Excel for data storage",
    )
    parser.add_argument(
        "--zoho-sync",
        action="store_true",
        dest="zoho_sync",
        help="Sync all unsynced ICP contacts to Zoho CRM (requires --use-db)",
    )

    args = parser.parse_args()

    # --- DB mode setup ---
    if args.use_db:
        global _USE_DB
        _USE_DB = True
        try:
            from icp_db import get_session, check_db_health
            session = get_session()
            health = check_db_health(session)
            print(f"DB mode: connected ({health['contacts']} contacts, {health['interactions']} interactions)")
            session.close()
        except Exception as e:
            print(f"ERROR: Could not connect to database: {e}")
            print("Make sure PostgreSQL is running: docker compose up -d")
            return
        # Reload employees/patterns from DB
        _load_additional_sitehub_employees()
        _load_learned_patterns()

    if args.zoho_sync:
        if not _USE_DB:
            print("ERROR: --zoho-sync requires --use-db flag")
            return
        from zoho_sync import sync_all_unsynced
        from zoho_client import is_configured
        if not is_configured():
            print("ERROR: Zoho CRM not configured. Set ZOHO_CLIENT_ID, ZOHO_CLIENT_SECRET, ZOHO_REFRESH_TOKEN.")
            return
        print("=== Zoho CRM Sync ===\n")
        from icp_db import get_session
        session = get_session()
        try:
            result = sync_all_unsynced(session)
            print(f"  Synced:  {result['synced']}")
            print(f"  Errors:  {result['errors']}")
            print(f"  Skipped: {result['skipped']}")
            print(f"  Total:   {result['total']}")
        finally:
            session.close()
        return

    if args.report:
        # Generate insights report
        generate_report(
            tracker_path=args.tracker,
            write_to_sheet=args.save,
        )
    elif args.quick:
        # Quick-add single entry
        if len(args.quick) < 2:
            print("Error: --quick requires at least NAME and TITLE")
            print('Usage: --quick "John Doe" "CEO at Acme Inc" ["PostHook"]')
            return

        name = args.quick[0]
        title = args.quick[1]
        post_hook = args.quick[2] if len(args.quick) > 2 else ""

        print(f"=== Quick Add Mode ===")
        entry = quick_add_entry(name, title, post_hook)
        print(f"  Name: {entry['Name']}")
        print(f"  Title: {entry['Title']}")
        print(f"  Company: {entry['Company'] or '(not extracted)'}")
        print(f"  Date: {entry['Date']}")
        if post_hook:
            print(f"  Post Hook: {post_hook}")
        print()

        update_tracker(
            tracker_path=args.tracker,
            reference_path=args.reference,
            new_interactions=[entry],
            use_web=not args.no_web,
            use_cvr=not args.no_cvr,
        )
    elif args.import_status:
        import_manual_classifications(
            source_path=args.import_status,
            tracker_path=args.tracker,
        )
        # After importing, run backfill to update Dashboard
        print("\nRunning backfill to update Dashboard...\n")
        update_tracker(
            tracker_path=args.tracker,
            reference_path=args.reference,
            use_web=not args.no_web,
            use_cvr=not args.no_cvr,
        )
    elif args.analyze:
        analyze_classifications(
            tracker_path=args.tracker,
            reference_path=args.reference,
        )
    elif args.backfill:
        print("=== Backfill Mode: enrich & re-classify existing data ===\n")
        update_tracker(
            tracker_path=args.tracker,
            reference_path=args.reference,
            use_web=not args.no_web,
            use_cvr=not args.no_cvr,
        )
    elif args.input:
        print(f"=== Processing new data from {args.input} ===\n")
        raw = Path(args.input).read_text(encoding="utf-8")
        summary = process_new_data(
            raw,
            tracker_path=args.tracker,
            reference_path=args.reference,
            use_web=not args.no_web,
            use_cvr=not args.no_cvr,
            minimal_mode=args.minimal,
        )
        # Auto-enter review mode if there are items needing review (unless --no-auto-review)
        if summary.get("Review", 0) > 0 and not args.no_auto_review:
            print("\n" + "=" * 50)
            print("Entering review mode for new contacts...")
            print("=" * 50 + "\n")
            interactive_review_mode(
                tracker_path=args.tracker,
                reference_path=args.reference,
            )
    elif args.review:
        # Interactive review mode
        interactive_review_mode(
            tracker_path=args.tracker,
            reference_path=args.reference,
        )
    elif args.set_source:
        # Backfill empty Source Profile values
        backfill_source_profile(
            tracker_path=args.tracker,
            source_name=args.set_source,
        )
        # Refresh Dashboard to reflect updated Source Profile data
        update_tracker(
            tracker_path=args.tracker,
            reference_path=args.reference,
            use_web=not args.no_web,
            use_cvr=not args.no_cvr,
        )
    else:
        # Interactive mode
        source_profile = ""
        if args.likes:
            mode_name = "LinkedIn Likes"
            print(f"=== Interactive Mode ({mode_name}) ===")
            print("Format: Alternating Name/Title lines copied from LinkedIn reactions")
            if args.post_hook:
                print(f"  Post Hook: {args.post_hook}")
            print()
            # Prompt for executive name (source profile)
            source_profile = input("Enter executive name (or press Enter to skip): ").strip()
            if source_profile:
                print(f"  Source Profile: {source_profile}")
            print()
            print("Paste LinkedIn data (end with empty line):")
        elif args.minimal:
            mode_name = "Minimal"
            print(f"=== Interactive Mode ({mode_name}): paste data (end with empty line) ===")
            print("Format: Name<tab>Title[<tab>Company][<tab>PostHook]")
        else:
            mode_name = "Standard"
            print(f"=== Interactive Mode ({mode_name}): paste data (end with empty line) ===")
        print()

        lines = []
        try:
            while True:
                line = input()
                if line.strip() == "":
                    break
                lines.append(line)
        except EOFError:
            pass

        if lines:
            raw = "\n".join(lines)

            if args.likes:
                # Parse using LinkedIn likes format
                print("Parsing LinkedIn likes format...")
                new_interactions = parse_linkedin_likes(raw, post_hook=args.post_hook, source_profile=source_profile)
                print(f"  Parsed {len(new_interactions)} interactions")
                new_interactions = validate_parsed_interactions(new_interactions)

                if new_interactions:
                    summary = update_tracker(
                        tracker_path=args.tracker,
                        reference_path=args.reference,
                        new_interactions=new_interactions,
                        use_web=not args.no_web,
                        use_cvr=not args.no_cvr,
                    )
                    # Auto-enter review mode if there are items needing review
                    if summary.get("Review", 0) > 0:
                        print("\n" + "=" * 50)
                        print("Entering review mode for new contacts...")
                        print("=" * 50 + "\n")
                        interactive_review_mode(
                            tracker_path=args.tracker,
                            reference_path=args.reference,
                        )
            else:
                summary = process_new_data(
                    raw,
                    tracker_path=args.tracker,
                    reference_path=args.reference,
                    use_web=not args.no_web,
                    use_cvr=not args.no_cvr,
                    minimal_mode=args.minimal,
                )
                # Auto-enter review mode if there are items needing review
                if summary.get("Review", 0) > 0:
                    print("\n" + "=" * 50)
                    print("Entering review mode for new contacts...")
                    print("=" * 50 + "\n")
                    interactive_review_mode(
                        tracker_path=args.tracker,
                        reference_path=args.reference,
                    )
        else:
            print("No data provided. Use --backfill to process existing data.")


# ---------------------------------------------------------------------------
# Test: Verify Company Identification Flow
# ---------------------------------------------------------------------------
def test_company_flow(verbose: bool = True) -> bool:
    """Test that the company identification flow works correctly.

    This test verifies the PROTECTED flow documented above parse_raw_data().
    Run this after any edits to ensure the flow wasn't broken.

    Args:
        verbose: When True, print the result of every sub-check; when False,
            only failures are printed (section headers always print).

    Returns True if all tests pass, False otherwise.
    """
    print("=" * 60)
    print("TESTING COMPANY IDENTIFICATION FLOW")
    print("=" * 60)

    all_passed = True

    # Test 1: Cache loading — company matching depends on these module-level
    # caches being populated before parsing runs.
    print("\n[1] Cache Loading...")
    if not _tracker_companies_cache:
        _load_tracker_companies_cache()
    if not _ref_companies_cache:
        try:
            load_reference_lists()
        except Exception:
            # Best-effort: reference lists are optional for this test; the
            # hard requirement (checked below) is the tracker cache.
            pass

    cache_size = len(_tracker_companies_cache)
    if cache_size > 0:
        print(f"    ✓ Tracker cache loaded: {cache_size} companies")
    else:
        # Plain string: no placeholders needed (was an f-string, ruff F541).
        print("    ✗ Tracker cache is EMPTY - this will break company matching!")
        all_passed = False

    # Test 2: Preposition detection — (title, extracted company, expected bool).
    # Danish prepositions (at/ved/hos/for) signal "title PREP company" layout.
    print("\n[2] Preposition Detection...")
    preposition_tests = [
        ("Direktør at XMANO.", "XMANO.", True),
        ("Projektchef ved Søren Jensen", "Søren Jensen", True),
        ("Projektkoordinator hos Junkbusters", "Junkbusters", True),
        ("Technical Purchasing for VGP Denmark", "VGP Denmark", True),
        ("CEO | Fake Company", "Fake Company", False),  # Not preposition-based
    ]
    for title, extracted, expected in preposition_tests:
        result = _is_preposition_based_extraction(title, extracted)
        status = "✓" if result == expected else "✗"
        if result != expected:
            all_passed = False
        if verbose or result != expected:
            print(f"    {status} '{extracted}' from '{title[:30]}...' = {result} (expected {expected})")

    # Test 3: Title extraction — (title, expected extracted company).
    print("\n[3] Title Extraction (parse_title_for_company)...")
    extraction_tests = [
        ("Direktør at XMANO.", "XMANO."),
        ("Projektchef ved Søren Jensen", "Søren Jensen"),
        ("Key Account Manager B2B Elgiganten A/S", "Elgiganten A/S"),
        ("Projektkoordinator hos Junkbusters", "Junkbusters"),
        ("Customer Success Manager @ Dalux", "Dalux"),
        ("Technical Purchasing for VGP Denmark", "VGP Denmark"),
    ]
    for title, expected in extraction_tests:
        result = parse_title_for_company(title)
        status = "✓" if result == expected else "✗"
        if result != expected:
            all_passed = False
        if verbose or result != expected:
            print(f"    {status} '{title[:40]}' → '{result}' (expected '{expected}')")

    # Test 4: Full parsing flow — raw LinkedIn-reactions paste through
    # parse_raw_data, checking Name → Company extraction end-to-end.
    print("\n[4] Full Parsing Flow (parse_raw_data)...")
    test_data = """Executive profile: SH  Post hook: Test post  like
Henrik Rohlin
View Henrik Rohlin's profile 2nd degree connection
· 2nd
Direktør at XMANO.

like
Michael Nauta
View Michael Nauta's profile 2nd degree connection
· 2nd
Key Account Manager B2B Elgiganten A/S
"""
    results = parse_raw_data(test_data)

    expected_results = {
        "Henrik Rohlin": "XMANO.",
        "Michael Nauta": "Elgiganten A/S",
    }

    for r in results:
        name = r.get("Name", "")
        company = r.get("Company", "")
        expected = expected_results.get(name, "")
        status = "✓" if company == expected else "✗"
        if company != expected:
            all_passed = False
        if verbose or company != expected:
            print(f"    {status} {name}: '{company}' (expected '{expected}')")

    # BUGFIX: previously, if parse_raw_data() returned no results (or silently
    # dropped a contact), the loop above never ran for that contact and the
    # test passed vacuously. Explicitly verify every expected name was parsed.
    parsed_names = {r.get("Name", "") for r in results}
    for name, expected in expected_results.items():
        if name not in parsed_names:
            print(f"    ✗ {name}: not parsed at all (expected '{expected}')")
            all_passed = False

    # Test 5: Blocklist keywords not extracted — taglines/slogans must not be
    # accepted as company names without external (CVR/Brave) validation.
    print("\n[5] False Positive Prevention...")
    false_positive_tests = [
        "Cirkularitet i byggeriet | Materialestationer | Donorbygninger",
        "Byggeledelse & Projektering",
    ]
    for title in false_positive_tests:
        extracted = parse_title_for_company(title)
        # These should either be None or require CVR/Brave validation
        if extracted and not _has_company_suffix(extracted):
            is_preposition = _is_preposition_based_extraction(title, extracted)
            if is_preposition:
                print(f"    ✗ '{extracted}' was accepted from '{title[:40]}' - should need validation!")
                all_passed = False
            else:
                print(f"    ✓ '{extracted}' extracted but would need CVR/Brave validation")
        else:
            print(f"    ✓ '{title[:40]}' → no false positive")

    # Summary banner
    print("\n" + "=" * 60)
    if all_passed:
        print("ALL TESTS PASSED ✓")
    else:
        print("SOME TESTS FAILED ✗ - Check output above")
    print("=" * 60)

    return all_passed


if __name__ == "__main__":
    main()
