#!/usr/bin/env python3
# -*- coding: utf-8 -*-
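"""Scrape posts from a Facebook page with the facebook-scraper library.

For each post, the script stores the raw JSON data, downloads any images and
videos it can reach, and writes an overall summary to report_summary.json.

Example invocation (the script filename and page name below are placeholders):

    python scrape_facebook_page.py SomePublicPage --cookies cookies.txt --pages 3
"""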

import argparse
import json
import logging
import re
import sys
import time
import traceback
from datetime import datetime
from pathlib import Path
from typing import Any, Set, Tuple

import requests
from requests.exceptions import RequestException

try:
    from facebook_scraper import get_posts, set_cookies

    LibraryError = None
    try:
        from facebook_scraper.exceptions import FacebookScraperError as LibraryError
    except ImportError:
        try:
            from facebook_scraper.exceptions import FacebookError as LibraryError
        except ImportError:
            logging.warning("Libreria 'facebook-scraper' molto vecchia. Si consiglia l'aggiornamento.")

except ImportError as e:
    logging.error(f"ERRORE DI IMPORTAZIONE: {e}")
    if "lxml.html.clean" in str(e):
        print(">> Per risolvere, esegui: pip install lxml_html_clean")
    sys.exit(1)

USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
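# Facebook session cookies that must be present for authenticated scraping.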
REQUIRED_COOKIES = {'c_user', 'xs'}

logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S',
)

class CustomJSONEncoder(json.JSONEncoder):
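    """JSON encoder that serializes datetime objects as ISO 8601 strings."""
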
    def default(self, obj: Any) -> Any:
        if isinstance(obj, datetime):
            return obj.isoformat()
        return super().default(obj)

def sanitize_filename(filename: str) -> str:
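    """Replace characters that are not valid in file names with underscores."""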
    return re.sub(r'[\\/*?:"<>|]', '_', filename)

def download_media(url: str, save_path: Path) -> bool:
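    """Stream a media file from `url` to `save_path`.

    Returns True on success, False if the HTTP request fails.
    """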
    try:
        headers = {
            'User-Agent': USER_AGENT,
            'Accept': 'image/webp,image/*,*/*;q=0.8',
            'Referer': 'https://www.facebook.com/',
        }
        with requests.get(url, headers=headers, stream=True, timeout=60) as response:
            response.raise_for_status()
            with open(save_path, 'wb') as f:
                for chunk in response.iter_content(chunk_size=8192):
                    f.write(chunk)
        logging.info(f"Media salvato: {save_path.name}")
        return True
    except RequestException as e:
        logging.error(f"Errore durante il download di [{url}]: {e}")
        return False

def validate_cookies(cookie_file: Path) -> Tuple[bool, Set[str]]:
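    """Check a Netscape-format cookies.txt file for the required Facebook cookies.

    Returns (is_valid, missing_names). The cookie name is the sixth
    tab-separated field of each data line.
    """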
    found_cookies = set()
    try:
        with open(cookie_file, 'r') as f:
            for line in f:
                # Some exporters prefix HttpOnly cookies (such as 'xs') with
                # '#HttpOnly_'; strip the prefix so those lines are still checked.
                if line.startswith('#HttpOnly_'):
                    line = line[len('#HttpOnly_'):]
                if line.startswith('#') or not line.strip():
                    continue
                parts = line.strip().split('\t')
                if len(parts) >= 6:
                    cookie_name = parts[5]
                    if cookie_name in REQUIRED_COOKIES:
                        found_cookies.add(cookie_name)
    except FileNotFoundError:
        logging.error(f"File cookie non trovato: {cookie_file}")
        return False, REQUIRED_COOKIES

    missing = REQUIRED_COOKIES - found_cookies
    return not missing, missing

def print_cookie_help():
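    """Print a short guide on exporting a cookies.txt file from the browser."""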
    guide = """
============================================================
 MANUALE PER OTTENERE I COOKIE DI FACEBOOK
============================================================
Per eseguire lo scraping in modalità autenticata (necessario
per la maggior parte delle pagine e per dati più completi),
è indispensabile fornire un file di cookie valido.

1. Installa un'estensione per esportare i cookie (es. Get cookies.txt LOCALLY).
2. Accedi a Facebook nel tuo browser.
3. Clicca sull'icona dell'estensione e esporta il file come "cookies.txt".
4. Assicurati che il file contenga 'c_user' e 'xs'.
5. Esegui lo script con: --cookies cookies.txt
============================================================
"""
    print(guide)

def main():
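    """Parse arguments, validate cookies, scrape the page, and write a summary report."""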
    parser = argparse.ArgumentParser(
        description='Download posts from a Facebook page, including media and data.',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    parser.add_argument('page', help='ID or name of the Facebook page to scrape.')
    parser.add_argument('-o', '--output', default='fb_data', help='Main output directory.')
    parser.add_argument('-p', '--pages', type=int, default=5, help='Number of "pages" of posts to download.')
    parser.add_argument('-c', '--cookies', help='Path to the cookies.txt file for authenticated access.')
    parser.add_argument('-d', '--delay', type=float, default=10.0, help='Delay in seconds between requests.')
    args = parser.parse_args()

    if args.cookies:
        cookie_path = Path(args.cookies)
        if not cookie_path.is_file():
            logging.error(f"File cookie '{args.cookies}' non trovato!")
            print_cookie_help()
            sys.exit(1)

        is_valid, missing = validate_cookies(cookie_path)
        if not is_valid:
            logging.error(f"ERRORE GRAVE: Cookie essenziali mancanti: {', '.join(missing)}")
            print_cookie_help()
            sys.exit(1)
        
        # Positional call for compatibility with very old versions of the library.
        set_cookies(str(cookie_path))
        logging.info("Valid cookies loaded. Scraping will run in authenticated mode.")
    else:
        logging.warning("Nessun file di cookie fornito. Modalità non autenticata.")

    base_dir = Path(args.output) / sanitize_filename(args.page)
    posts_dir = base_dir / 'posts'
    base_dir.mkdir(parents=True, exist_ok=True)
    posts_dir.mkdir(exist_ok=True)

    all_posts_summary = []
    start_time = time.time()

    logging.info(f"INIZIO SCRAPING dalla pagina: {args.page}")
    logging.info(f"I dati verranno salvati in: {base_dir}")

    try:
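        # facebook-scraper options: fetch reactions and comments, allow the
        # extra HTTP requests needed for richer data, and request 5 posts per page.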
        scraper_options = {
            "reactions": True, "comments": True, "allow_extra_requests": True,
            "posts_per_page": 5,
        }
        post_iterator = get_posts(args.page, pages=args.pages, options=scraper_options, extra_info=True)

        for i, post in enumerate(post_iterator):
            post_id = post.get('post_id') or f'no_id_{int(time.time())}_{i}'
            logging.info(f"--- Processing post #{i+1} (ID: {post_id}) ---")
            post_folder = posts_dir / f"post_{post_id}"
            post_folder.mkdir(exist_ok=True)

            with open(post_folder / 'raw_data.json', 'w', encoding='utf-8') as f:
                json.dump(post, f, ensure_ascii=False, indent=2, cls=CustomJSONEncoder)

            # Copy the image list so the post dict is not mutated, and fall back
            # to the single 'image' field when 'images' is missing or empty.
            images_to_download = list(post.get('images') or [])
            if not images_to_download and post.get('image'):
                images_to_download.append(post['image'])

            if images_to_download:
                images_dir = post_folder / 'images'
                images_dir.mkdir(exist_ok=True)
                for j, img_url in enumerate(images_to_download):
                    download_media(img_url, images_dir / f'image_{j}.jpg')

            if post.get('video'):
                video_dir = post_folder / 'videos'
                video_dir.mkdir(exist_ok=True)
                download_media(post['video'], video_dir / f"video_{post.get('video_id', 'id')}.mp4")

            summary = {
                'id': post_id,
                'timestamp': post.get('time').isoformat() if post.get('time') else None,
                'text': post.get('text'), 'likes': post.get('likes'),
                'comments': post.get('comments'), 'shares': post.get('shares'),
                'url': post.get('post_url'), 'scraped_at': datetime.now().isoformat(),
            }
            all_posts_summary.append(summary)
            logging.info(f"Attesa di {args.delay} secondi...")
            time.sleep(args.delay)

    except KeyboardInterrupt:
        logging.warning("Interruzione manuale rilevata. Salvataggio in corso...")
    except Exception as e:
        if LibraryError and isinstance(e, LibraryError):
            logging.critical(f"ERRORE CRITICO DI FACEBOOK: {e}")
        else:
            logging.error(f"ERRORE IMPREVISTO: {e.__class__.__name__}: {e}")
        traceback.print_exc()

    finally:
        if all_posts_summary:
            report_path = base_dir / 'report_summary.json'
            logging.info(f"Salvataggio del report finale in: {report_path}")
            with open(report_path, 'w', encoding='utf-8') as f:
                json.dump(all_posts_summary, f, ensure_ascii=False, indent=2)

        elapsed_time = time.time() - start_time
        logging.info("=" * 50)
        logging.info("OPERAZIONE COMPLETATA")
        logging.info(f"Post totali processati: {len(all_posts_summary)}")
        logging.info(f"Tempo totale: {elapsed_time / 60:.2f} minuti")
        if all_posts_summary:
            logging.info(f"Report salvato in: {report_path}")

if __name__ == "__main__":
    main()