#!/usr/bin/env python3
"""
ФИНАЛЬНЫЙ ПАРСЕР CITILINK
С полным списком брендов и исправлениями
"""

import os
import re
import sys
import time
from datetime import datetime
from urllib.parse import urljoin

import mysql.connector
import requests
from bs4 import BeautifulSoup

# MySQL connection settings passed verbatim to mysql.connector.connect().
# Every value can be overridden through an environment variable so that
# credentials do not have to live in the source tree; the literals below are
# only fallbacks that keep existing deployments working unchanged.
DB_CONFIG = {
    'host': os.environ.get('CATALOG_DB_HOST', 'localhost'),
    'user': os.environ.get('CATALOG_DB_USER', 'catalog_user'),
    'password': os.environ.get('CATALOG_DB_PASSWORD', 'Catalog2026'),
    'database': os.environ.get('CATALOG_DB_NAME', 'cartridge_catalog'),
}

# Default HTTP headers installed on the scraping session.
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
}

# Canonical brand name -> lower-case keywords that identify it in a title.
# Product lines such as "Canon PG" or "Epson T" are intentionally absent:
# they are not manufacturers and could never win over the plain brand keyword.
_BRAND_KEYWORDS = {
    # Major OEM brands
    'HP': ('hp', 'hewlett', 'hewlett-packard'),
    'Canon': ('canon',),
    'Brother': ('brother',),
    'Xerox': ('xerox',),
    'Samsung': ('samsung',),
    'Epson': ('epson',),
    'Kyocera': ('kyocera',),

    # Other popular brands
    'Pantum': ('pantum',),
    'OKI': ('oki',),
    'Lexmark': ('lexmark',),
    'Ricoh': ('ricoh',),
    'Konica': ('konica',),
    'Sharp': ('sharp',),
    'Toshiba': ('toshiba',),
    'Dell': ('dell',),
    'Lenovo': ('lenovo',),

    # Compatible / third-party brands
    'Cactus': ('cactus',),
    'InkTek': ('inktek',),
    'ProfLine': ('profline',),
    'PrintRite': ('printrite',),
    'Альт': ('альт',),
    'PrintWorks': ('printworks',),
    'Briz': ('briz',),
    'ColorWay': ('colorway',),
    'Imazing': ('imazing',),
    'MyInk': ('myink',),
}

# One compiled pattern per brand. A keyword only counts when it is not glued
# to another letter on either side, so "oki" no longer matches "Nokia" and
# "альт" no longer matches "альтернативный". Digits are allowed next to a
# keyword so that titles like "HP59A" are still recognised.
_BRAND_PATTERNS = tuple(
    (
        brand,
        re.compile(
            r'(?<![^\W\d_])(?:'
            + '|'.join(re.escape(keyword) for keyword in keywords)
            + r')(?![^\W\d_])'
        ),
    )
    for brand, keywords in _BRAND_KEYWORDS.items()
)

# Leading words that describe the product type rather than its maker.
_NOT_BRAND_WORDS = frozenset({
    'Картридж', 'Комплект', 'Набор', 'Оригинальный',
    'Совместимый', 'Отзывы', 'Лазерный', 'Струйный',
    'Тонер-картридж', 'Тонер', 'Чернила', 'Фотобарабан',
})


def extract_manufacturer_improved(title):
    """Guess the manufacturer of a cartridge from its product title.

    The lookup runs in three stages:

    1. Known brands are searched as whole words (case-insensitively). When
       several brands occur, the one mentioned earliest in the title wins,
       so "Cactus CS-CE285A для HP" is attributed to Cactus, not HP.
    2. Otherwise the first word is used if it is capitalised, longer than two
       characters and not a generic product-type word.
    3. Otherwise the first all-upper-case word without digits (longer than
       two characters) is used.

    Args:
        title: Product title as shown on the site; may be empty or None.

    Returns:
        The canonical brand name, a word taken from the title, or the string
        "Другой" when nothing plausible is found.
    """
    if not title:
        return "Другой"

    title_lower = title.lower()

    # 1. Known brands: keep the match that starts earliest; the strict "<"
    #    makes table order the tie-breaker.
    best_position = None
    best_brand = None
    for brand, pattern in _BRAND_PATTERNS:
        match = pattern.search(title_lower)
        if match and (best_position is None or match.start() < best_position):
            best_position = match.start()
            best_brand = brand
    if best_brand is not None:
        return best_brand

    words = title.split()

    # 2. A capitalised first word that is not a product-type word.
    first_word = words[0].rstrip(':') if words else ''
    if (len(first_word) > 2
            and first_word[0].isupper()
            and first_word not in _NOT_BRAND_WORDS):
        return first_word

    # 3. First fully upper-case word without digits (digits suggest a model).
    for word in words:
        if (len(word) > 2 and word.isupper()
                and not any(char.isdigit() for char in word)):
            return word

    return "Другой"

_BASE_URL = "https://www.citilink.ru"
_CATALOG_URL = _BASE_URL + "/catalog/kartridzhi/"
_REQUEST_DELAY_SECONDS = 1.2  # pause before every product-page request
_ER_DUP_ENTRY = 1062  # MySQL server error number for a duplicate key

_PRODUCT_LINK_SELECTORS = (
    'a[href*="/product/"][href*="kartridzh"]',
    'a.ProductCardVertical__name',
    '.product-card a[href*="/product/"]',
)
_TITLE_SELECTORS = ('h1', '.ProductHeader__title', '[itemprop="name"]')
_PRICE_SELECTORS = (
    '.ProductHeader__price',
    '[itemprop="price"]',
    '.ProductCardVertical__price-current-price',
)
_IMAGE_SELECTORS = ('img[itemprop="image"]', '.product-image img')

# Vendor-specific model formats come first; the generic fallbacks at the end
# would otherwise cut e.g. "MLT-D111S" down to "111S".
_MODEL_PATTERNS = tuple(re.compile(pattern) for pattern in (
    r'(MLT-[A-Z0-9]+)',
    r'(TN-[0-9]+)',
    r'(CLI-[0-9]+)',
    r'(PGI-[0-9]+)',
    r'(LC-[0-9]+)',
    r'(CF[0-9]+[A-Z]?)',
    r'([A-Z]{2,}\d+[A-Z]?)',
    r'([A-Z]+-\d+)',
    r'(\d{3,}[A-Z]?)',
))


def _code_pattern(codes):
    """Compile a pattern matching any of *codes* as a stand-alone token."""
    return re.compile(
        r'(?<![\w-])(?:' + '|'.join(re.escape(code) for code in codes) + r')(?![\w-])'
    )


# (color name, Russian stems matched as substrings, Latin codes matched only
# as stand-alone tokens). Matching single letters such as "c" or "k" as plain
# substrings would classify every "Canon" or "Kyocera" title by its brand.
_COLOR_RULES = tuple(
    (name, stems, _code_pattern(codes))
    for name, stems, codes in (
        ('черный', ('черн', 'чёрн'), ('black', 'bk', 'k')),
        ('голубой', ('голуб',), ('cyan', 'c')),
        ('желтый', ('желт', 'жёлт'), ('yellow', 'y')),
        ('пурпурный', ('пурпур',), ('magenta', 'm')),
        ('красный', ('красн',), ('red', 'r')),
        ('цветной', ('цветн', 'комплект'), ('color',)),
    )
)


def _collect_product_links(soup):
    """Return unique absolute cartridge product URLs in page order."""
    seen = set()
    links = []
    for selector in _PRODUCT_LINK_SELECTORS:
        for anchor in soup.select(selector):
            href = anchor.get('href', '')
            if not href or 'kartridzh' not in href.lower():
                continue
            # urljoin leaves absolute URLs alone and also copes with
            # protocol-relative and slash-less relative links.
            full_url = urljoin(_BASE_URL, href)
            if full_url not in seen:
                seen.add(full_url)
                links.append(full_url)
    return links


def _parse_price(text):
    """Return the first number in *text* as a float, or None if there is none."""
    # \s also covers the non-breaking and thin spaces used as thousands
    # separators ("1 290 ₽"), which float() cannot digest.
    compact = re.sub(r'\s+', '', text or '')
    match = re.search(r'\d+(?:[.,]\d+)?', compact)
    if match is None:
        return None
    return float(match.group(0).replace(',', '.'))


def _extract_model(title):
    """Return the first model-like token in *title*, or "Не указана"."""
    for pattern in _MODEL_PATTERNS:
        match = pattern.search(title)
        if match:
            return match.group(1)
    return "Не указана"


def _detect_color(title):
    """Return the color named in *title*; "черный" when none is recognised."""
    lowered = title.lower()
    for name, stems, codes in _COLOR_RULES:
        if any(stem in lowered for stem in stems) or codes.search(lowered):
            return name
    return "черный"


def _scrape_product(product_soup):
    """Extract (title, price, model, color, image_url) from a product page."""
    title = "Картридж"
    for selector in _TITLE_SELECTORS:
        elem = product_soup.select_one(selector)
        # Collapse line breaks and non-breaking spaces inside the heading.
        text = " ".join(elem.get_text().split()) if elem is not None else ""
        if text:
            title = text
            break

    price = 0.0
    for selector in _PRICE_SELECTORS:
        elem = product_soup.select_one(selector)
        if elem is None:
            continue
        # <meta itemprop="price"> carries the value in its "content" attribute.
        parsed = _parse_price(elem.get('content') or elem.get_text())
        if parsed is not None:
            price = parsed
            break

    image_url = ""
    for selector in _IMAGE_SELECTORS:
        img = product_soup.select_one(selector)
        if img is not None and img.get('src'):
            image_url = urljoin(_BASE_URL, img['src'])
            break

    return title, price, _extract_model(title), _detect_color(title), image_url


def _run_parser(conn, cursor):
    """Scrape the catalog and store new products using an open connection."""
    try:
        cursor.execute("SET NAMES utf8mb4")
    except mysql.connector.Error as e:
        print(f"❌ Ошибка подключения: {e}")
        return
    print("✅ Подключение к базе (UTF-8)")

    with requests.Session() as session:
        session.headers.update(HEADERS)

        print("📥 Загружаем каталог картриджей...")
        try:
            response = session.get(_CATALOG_URL, timeout=15)
            response.raise_for_status()
        except requests.RequestException as e:
            print(f"❌ Ошибка: {e}")
            return
        response.encoding = 'utf-8'  # do not rely on charset sniffing

        product_links = _collect_product_links(
            BeautifulSoup(response.text, 'html.parser')
        )
        print(f"🔍 Найдено уникальных товаров: {len(product_links)}")

        if '--test' in sys.argv:
            product_links = product_links[:5]
            print("🧪 Тестовый режим: 5 товаров")
        else:
            product_links = product_links[:20]

        saved = 0
        skipped = 0

        for i, url in enumerate(product_links, 1):
            print(f"\n[{i}/{len(product_links)}] {url[:70]}...")

            try:
                # LIMIT 1 + fetchall() guarantees the result set is fully
                # consumed, so the cursor can be reused for the next query.
                cursor.execute(
                    "SELECT id FROM cartridges WHERE source_url = %s LIMIT 1",
                    (url,),
                )
                if cursor.fetchall():
                    print("   ⏭️ Уже в базе, пропускаем")
                    skipped += 1
                    continue

                # Throttle only when a request is actually about to be sent.
                time.sleep(_REQUEST_DELAY_SECONDS)
                resp = session.get(url, timeout=10)
                if resp.status_code != 200:
                    print(f"   ⚠️ Ошибка HTTP {resp.status_code}")
                    continue
                resp.encoding = 'utf-8'

                title, price, model, color, image_url = _scrape_product(
                    BeautifulSoup(resp.text, 'html.parser')
                )
                manufacturer = extract_manufacturer_improved(title)

                cursor.execute("""
                    INSERT INTO cartridges 
                    (title, model, price, manufacturer, color, image_url, source_url)
                    VALUES (%s, %s, %s, %s, %s, %s, %s)
                """, (title, model, price, manufacturer, color, image_url, url))

                saved += 1
                print(f"   ✅ {manufacturer} {model} - {price:,.0f} руб ({color})")
                print(f"      {title[:60]}...")

            except mysql.connector.Error as e:
                if e.errno == _ER_DUP_ENTRY:
                    print("   ⚠️ Дубликат, пропускаем")
                    skipped += 1
                else:
                    print(f"   ❌ Ошибка БД: {e}")
            except Exception as e:
                # Best effort: one broken page must not abort the whole run.
                print(f"   ❌ Ошибка: {e}")

    conn.commit()

    # Record the run in the parser log.
    cursor.execute("""
        INSERT INTO parser_logs (parser_name, action, items_count, status, message)
        VALUES ('citilink_final', 'parse', %s, 'success', 'Финальный парсинг')
    """, (saved,))
    conn.commit()

    print(f"\n{'='*60}")
    print("📊 ИТОГИ:")
    print(f"   Обработано: {len(product_links)}")
    print(f"   Сохранено: {saved}")
    print(f"   Пропущено: {skipped}")
    print("=" * 60)

    # Per-brand statistics over the whole table, on the same connection.
    cursor.execute("""
        SELECT manufacturer, COUNT(*) as count 
        FROM cartridges 
        GROUP BY manufacturer 
        ORDER BY count DESC, manufacturer
    """)
    brand_rows = cursor.fetchall()

    print("\n🏷️ СТАТИСТИКА ПО БРЕНДАМ:")
    for brand, count in brand_rows:
        print(f"   • {brand}: {count} товаров")


def parse_citilink_full():
    """Scrape the Citilink cartridge catalog into the ``cartridges`` table.

    Connects to MySQL using DB_CONFIG, downloads the catalog page, and for
    each product link (at most 20, or 5 when ``--test`` is in ``sys.argv``)
    that is not yet stored by ``source_url``, downloads the product page and
    inserts title, model, price, manufacturer, color and image URL. After the
    loop the inserts are committed, a row is written to ``parser_logs``, and
    run totals plus per-brand counts are printed.

    Connection and catalog-download failures are reported on stdout and end
    the run early; failures on a single product are reported and skipped.
    The cursor and connection are always closed, including on early exit.

    Returns:
        None.
    """
    print("=" * 60)
    print("🤖 ФИНАЛЬНЫЙ ПАРСЕР CITILINK")
    print("=" * 60)

    try:
        conn = mysql.connector.connect(**DB_CONFIG)
    except mysql.connector.Error as e:
        print(f"❌ Ошибка подключения: {e}")
        return

    try:
        cursor = conn.cursor()
        try:
            _run_parser(conn, cursor)
        finally:
            cursor.close()
    finally:
        conn.close()

# Run the parser only when this file is executed as a script, not on import.
if __name__ == "__main__":
    parse_citilink_full()