import requests
from bs4 import BeautifulSoup
import re
import logging
from datetime import datetime
import time
import random
import sqlite3
from urllib.parse import urljoin
import os

class AmazonBookScraper:
    def __init__(self, partner_id="kilobitbe-21", base_path="/var/www/html/domains/boekenvlinder.be"):
        self.session = requests.Session()
        self.partner_id = partner_id
        self.base_path = base_path

        # Base headers that mimic a real browser
        self.base_headers = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'Accept-Encoding': 'gzip, deflate',  # 'br' omitted: requests can only decode brotli if the brotli package is installed
            'DNT': '1',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
            'Sec-Fetch-Dest': 'document',
            'Sec-Fetch-Mode': 'navigate',
            'Sec-Fetch-Site': 'none',
            'Sec-Fetch-User': '?1',
            'Cache-Control': 'max-age=0'
        }

        # Realistic User-Agent strings to rotate through
        self.user_agents = [
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.2 Safari/605.1.15',
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:121.0) Gecko/20100101 Firefox/121.0',
            'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
        ]

        # Ensure the working directory for the database and log file exists
        os.makedirs(self.base_path, exist_ok=True)

        self.db_path = os.path.join(self.base_path, 'books.db')
        self.log_path = os.path.join(self.base_path, 'amazon_scraper.log')

        # Setup logging
        logging.basicConfig(
            filename=self.log_path,
            level=logging.INFO,
            format='%(asctime)s - %(levelname)s - %(message)s'
        )

        self.setup_database()

    def setup_database(self):
        """Initialize database tables"""
        conn = sqlite3.connect(self.db_path)
        c = conn.cursor()
        c.execute('''
            CREATE TABLE IF NOT EXISTS books (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                asin TEXT UNIQUE,
                title TEXT,
                author TEXT,
                current_price REAL,
                original_price REAL,
                discount_percentage REAL,
                url TEXT,
                language TEXT,
                image_url TEXT,
                last_updated TIMESTAMP
            )
        ''')
        conn.commit()
        conn.close()
        logging.info(f"Database initialized at {self.db_path}")

    def get_search_url(self, base_url, language):
        """Genereer de juiste zoek-URL per taal"""
        search_params = {
            'nl': {
                'keyword': 'boeken aanbieding',
                'category': '16384159031'
            },
            'fr': {
                'keyword': 'livres promotion',
                'category': '301061'
            },
            'en': {
                'keyword': 'book deals',
                'category': '266239'
            }
        }

        params = search_params.get(language, search_params['nl'])
        return f"{base_url}/s?k={params['keyword'].replace(' ', '+')}&rh=n%3A{params['category']}&dc"

    def get_with_retry(self, url, max_retries=3):
        """Voer request uit met retry mechanisme"""
        for attempt in range(max_retries):
            try:
                # Build fresh headers with a random User-Agent for every request
                headers = self.base_headers.copy()
                headers['User-Agent'] = random.choice(self.user_agents)

                # Add a realistic delay between requests
                time.sleep(random.uniform(2, 5))

                response = self.session.get(
                    url,
                    headers=headers,
                    timeout=10
                )
                response.raise_for_status()
                return response

            except requests.exceptions.HTTPError as e:
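                # Amazon answers with 503 when it throttles automated traffic;
                # back off with a linearly growing delay before retrying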
                if e.response.status_code == 503 and attempt < max_retries - 1:
                    delay = (attempt + 1) * 5
                    logging.warning(f"503 error, retrying in {delay} seconds...")
                    time.sleep(delay)
                    continue
                raise
            except Exception as e:
                if attempt < max_retries - 1:
                    logging.warning(f"Request failed: {str(e)}, retrying...")
                    time.sleep(random.uniform(1, 3))
                    continue
                raise
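
    @staticmethod
    def _parse_price(price_text):
        """Parse a localized price string into a float.

        A minimal sketch, assuming the price formats seen on the target
        marketplaces: '12,99' (nl/fr, decimal comma), '12.99' (UK, decimal
        point) and '1.234,56' (grouped thousands). The last separator is
        taken as the decimal mark; three trailing digits indicate a
        thousands separator instead.
        """
        cleaned = re.sub(r'[^\d.,]', '', price_text)
        if not cleaned:
            return None
        last_sep = max(cleaned.rfind(','), cleaned.rfind('.'))
        if last_sep == -1:
            return float(cleaned)
        integer_part = re.sub(r'[.,]', '', cleaned[:last_sep])
        frac = cleaned[last_sep + 1:]
        if len(frac) == 3:
            # e.g. '1,299' -- grouping separator, not a decimal mark
            return float(integer_part + frac)
        return float(f"{integer_part}.{frac}")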

    def scrape_books(self, language='nl'):
        """Scrape discounted books for a specific language"""
        base_urls = {
            'nl': 'https://www.amazon.nl',
            'fr': 'https://www.amazon.fr',
            'en': 'https://www.amazon.co.uk'
        }

        if language not in base_urls:
            logging.error(f"Unsupported language: {language}")
            return

        base_url = base_urls[language]
        search_url = self.get_search_url(base_url, language)

        try:
            logging.info(f"Starting scrape for {language} from {search_url}")
            response = self.get_with_retry(search_url)

            # Dump the raw response next to the database so broken selectors can be debugged later
            with open(os.path.join(self.base_path, f'amazon_response_{language}.html'), 'w', encoding='utf-8') as f:
                f.write(response.text)

            soup = BeautifulSoup(response.text, 'html.parser')
            books = []

            for item in soup.find_all('div', {'data-component-type': 's-search-result'}):
                try:
                    # Extract book information more robustly
                    title_elem = (
                        item.find('span', {'class': 'a-text-normal'}) or
                        item.find('h2', {'class': 'a-size-mini'}) or
                        item.find('span', {'class': 'a-size-medium'})
                    )
                    if not title_elem:
                        logging.warning(f"Could not find title element, skipping item")
                        continue

                    title = title_elem.text.strip()

                    author_elem = (
                        item.find('div', {'class': 'a-row a-size-base a-color-secondary'}) or
                        item.find('div', {'class': 'a-row'})
                    )
                    author = author_elem.text.strip() if author_elem else "Unknown"

                    # More robust price extraction
                    price_elem = (
                        item.find('span', {'class': 'a-offscreen'}) or
                        item.find('span', {'class': 'a-price-whole'})
                    )

                    try:
                        current_price = self._parse_price(price_elem.text) if price_elem else None
                    except (ValueError, AttributeError) as e:
                        logging.warning(f"Error parsing current price: {e}")
                        current_price = None

                    original_price_elem = (
                        item.find('span', {'class': 'a-text-price'}) or
                        item.find('span', {'class': 'a-price a-text-price'})
                    )

                    try:
                        original_price = self._parse_price(original_price_elem.text) if original_price_elem else current_price
                    except (ValueError, AttributeError) as e:
                        logging.warning(f"Error parsing original price: {e}")
                        original_price = current_price

                    url = urljoin(base_url, item.find('a', {'class': 'a-link-normal'})['href'])
                    url = self.modify_url_with_partner_id(url)

                    asin = item['data-asin']

                    image_url = item.find('img', {'class': 's-image'})['src']

                    discount = ((original_price - current_price) / original_price * 100) if original_price and current_price else 0

                    books.append({
                        'asin': asin,
                        'title': title,
                        'author': author,
                        'current_price': current_price,
                        'original_price': original_price,
                        'discount_percentage': discount,
                        'url': url,
                        'language': language,
                        'image_url': image_url,
                        'last_updated': datetime.now().isoformat()  # ISO text avoids sqlite3's deprecated datetime adapter
                    })

                    logging.info(f"Successfully scraped book: {title}")

                except Exception as e:
                    logging.error(f"Error processing book item: {str(e)}")
                    continue

            self.save_to_database(books)
            logging.info(f"Successfully scraped {len(books)} books for language: {language}")

        except Exception as e:
            logging.error(f"Error scraping {language} books: {str(e)}")

    def save_to_database(self, books):
        """Save scraped books to SQLite database"""
        conn = sqlite3.connect(self.db_path)
        c = conn.cursor()

        for book in books:
            try:
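                # INSERT OR REPLACE relies on the UNIQUE constraint on asin:
                # an existing row for the same ASIN is overwritten with fresh data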
                c.execute('''
                    INSERT OR REPLACE INTO books
                    (asin, title, author, current_price, original_price,
                     discount_percentage, url, language, image_url, last_updated)
                    VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
                ''', (
                    book['asin'], book['title'], book['author'],
                    book['current_price'], book['original_price'],
                    book['discount_percentage'], book['url'],
                    book['language'], book['image_url'],
                    book['last_updated']
                ))
            except Exception as e:
                logging.error(f"Error saving book to database: {str(e)}")
                continue

        conn.commit()
        conn.close()
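
    def get_top_deals(self, language=None, limit=10):
        """Return the best current discounts from the database.

        A convenience sketch for consumers of the books table, assuming
        downstream pages want the largest discounts first; pass a language
        code to filter, or None for all marketplaces.
        """
        conn = sqlite3.connect(self.db_path)
        c = conn.cursor()
        rows = c.execute('''
            SELECT title, author, current_price, original_price,
                   discount_percentage, url, image_url
            FROM books
            WHERE (? IS NULL OR language = ?)
            ORDER BY discount_percentage DESC
            LIMIT ?
        ''', (language, language, limit)).fetchall()
        conn.close()
        return rows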

if __name__ == "__main__":
    scraper = AmazonBookScraper()

    for language in ['nl', 'fr', 'en']:
        logging.info(f"Starting scrape for language: {language}")
        scraper.scrape_books(language)
        # Random delay between languages to avoid rate limiting
        time.sleep(random.uniform(2, 5))
