import pandas as pd
import os

INPUT_FILE = 'wp-content/plugins/zrm-generalizer/generalized_jobtitles.xlsx'
OUTPUT_FILE = 'wp-content/plugins/zrm-generalizer/analyzed_jobtitles.xlsx'

def analyze():
    print(f"Reading {INPUT_FILE}...")
    try:
        df = pd.read_excel(INPUT_FILE)
    except Exception as e:
        print(f"Error reading file: {e}")
        return

    # Dictionary to store generalized_title -> set of original_titles
    # Using set to deduplicate original titles for the *same* generalized title
    # (i.e. if "Dev" -> "Dev" and "Dev" -> "Dev", we list it once? 
    # Or count total frequency? User said "combine the original titles".
    # I'll store all original titles in a list to show frequency, or unique?
    # Usually lists of originals are better unique-ified per row, but cumulative across rows.
    # Let's store a set of original titles to avoid massive strings if many duplicates,
    # but keep a separate counter for total occurrences.
    
    # Actually, if many job ads have the same title, we probably want to know that.
    # But "combine the original titles" implies a list.
    # If 1000 rows are "Nurse", and generalized is "Nurse", listing "Nurse" 1000 times is useless.
    # So I will store UNIQUE original titles, but keep the COUNT of total matches.
    
    gen_map = {}
    
    total_rows = len(df)
    print(f"Processing {total_rows} rows...")
    
    title_columns = [f'title_{i}' for i in range(1, 11)]
    
    for index, row in df.iterrows():
        original = str(row['original_title']).strip()
        
        # Get all generalized titles from the 10 columns
        # Filter out NaNs and empty strings
        g_titles = set()
        for col in title_columns:
            val = row.get(col)
            if pd.notna(val):
                s_val = str(val).strip()
                if s_val:
                    g_titles.add(s_val)
        
        # Add to map
        for gt in g_titles:
            if gt not in gen_map:
                gen_map[gt] = {
                    'count': 0,
                    'originals': set()
                }
            gen_map[gt]['count'] += 1
            gen_map[gt]['originals'].add(original)
            
    print(f"Found {len(gen_map)} unique generalized titles.")
    
    # Convert to list for DataFrame
    output_data = []
    for gt, data in gen_map.items():
        # Join originals with ' | ' or similar
        # Sort originals alphabetically for consistency
        originals_list = sorted(list(data['originals']))
        originals_str = " | ".join(originals_list)
        
        output_data.append({
            'Generalized Title': gt,
            'Count': data['count'],
            'Unique Original Titles Count': len(originals_list),
            'Original Titles': originals_str
        })
        
    # Create DataFrame
    out_df = pd.read_json(pd.Series(output_data).to_json(orient='values')) 
    # Direct DataFrame creation is safer usually
    out_df = pd.DataFrame(output_data)
    
    # Sort by Count Descending
    out_df = out_df.sort_values(by='Count', ascending=False)
    
    print(f"Writing to {OUTPUT_FILE}...")
    out_df.to_excel(OUTPUT_FILE, index=False)
    print("Done.")

if __name__ == "__main__":
    analyze()