#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Advanced ProductHunt scraper that copes with the Cloudflare Turnstile challenge."""

import asyncio
import os
import sqlite3
from contextlib import closing
from urllib.parse import urlparse

from loguru import logger


class AdvancedProductHuntScraper:
    """Scrape ProductHunt product pages with Playwright and persist them in SQLite.

    The browser is launched in headed mode so that a person can solve the
    Cloudflare challenge by hand when one is shown.
    """

    # Substrings of the page title that identify a Cloudflare interstitial.
    # "Just a moment" is the English title; the browser context below asks
    # for en-US content, so it has to be recognised as well.
    CHALLENGE_TITLE_MARKERS = ("请稍候", "Checking", "Verifying", "Just a moment")
    # How long to wait for navigation and for the manual challenge (5 minutes).
    CHALLENGE_TIMEOUT_MS = 300_000
    # Fixed pause that lets the page finish rendering before extraction.
    SETTLE_DELAY_MS = 5_000
    # Selectors tried in order when looking for the product name.
    NAME_SELECTORS = ("h1", "[data-test='product-name']", ".product-name", "title")

    def __init__(self, db_path="test_product.db"):
        """Remember the database location and make sure the schema exists.

        Args:
            db_path: Path of the SQLite file that stores scraped products.
        """
        self.db_path = db_path
        self.init_database()

    def init_database(self):
        """Create the ``products`` table if it does not exist yet."""
        # closing() guarantees the connection is released even if a statement
        # raises; sqlite3's own context manager only handles the transaction.
        with closing(sqlite3.connect(self.db_path)) as conn:
            conn.execute("""
                CREATE TABLE IF NOT EXISTS products (
                    id INTEGER PRIMARY KEY AUTOINCREMENT,
                    name TEXT,
                    url TEXT UNIQUE,
                    introduction TEXT,
                    user_count INTEGER,
                    maker_link TEXT,
                    maker_statement TEXT,
                    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                    updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
                )
            """)
            conn.commit()
        logger.info(f"数据库已初始化: {self.db_path}")

    def check_duplicate(self, url):
        """Return True when a product with this URL is already stored."""
        with closing(sqlite3.connect(self.db_path)) as conn:
            row = conn.execute(
                "SELECT id FROM products WHERE url = ?", (url,)
            ).fetchone()
        return row is not None

    def save_product_info(self, product_info):
        """Insert the product, or update the existing row with the same URL.

        Args:
            product_info: Mapping with a required ``url`` key and the optional
                keys ``name``, ``introduction``, ``user_count``,
                ``maker_link`` and ``maker_statement``. Missing optional keys
                are stored as NULL.

        Raises:
            KeyError: If ``url`` is missing.
        """
        url = product_info['url']
        name = product_info.get('name')
        details = (
            product_info.get('introduction'),
            product_info.get('user_count'),
            product_info.get('maker_link'),
            product_info.get('maker_statement'),
        )

        with closing(sqlite3.connect(self.db_path)) as conn:
            existing = conn.execute(
                "SELECT id FROM products WHERE url = ?", (url,)
            ).fetchone()

            if existing:
                conn.execute("""
                    UPDATE products
                    SET name = ?, introduction = ?, user_count = ?,
                        maker_link = ?, maker_statement = ?,
                        updated_at = CURRENT_TIMESTAMP
                    WHERE url = ?
                """, (name, *details, url))
                logger.info(f"更新产品信息: {name}")
            else:
                conn.execute("""
                    INSERT INTO products
                        (name, url, introduction, user_count, maker_link, maker_statement)
                    VALUES (?, ?, ?, ?, ?, ?)
                """, (name, url, *details))
                logger.info(f"保存产品信息: {name}")

            conn.commit()

    @staticmethod
    def _blank_product(url, name):
        """Build a product record that carries only a URL and a name."""
        return {
            'url': url,
            'name': name,
            'introduction': None,
            'user_count': None,
            'maker_link': None,
            'maker_statement': None,
        }

    @staticmethod
    def _name_from_url(url):
        """Derive a display name from a ``.../products/<slug>`` URL, else None."""
        # Dropping empty segments makes a trailing slash harmless and stops a
        # bare "/products/" path from producing an empty name.
        segments = [part for part in urlparse(url).path.split('/') if part]
        if len(segments) >= 2 and segments[-2] == 'products':
            return segments[-1].replace('-', ' ').title()
        return None

    async def _wait_for_challenge(self, page):
        """Block until the Cloudflare challenge title goes away, or reload on failure."""
        logger.info("检测到Cloudflare挑战页面,等待用户手动验证...")
        try:
            # The marker list is handed to the page so the Python check and
            # the in-page predicate can never drift apart.
            await page.wait_for_function(
                "(markers) => !markers.some((m) => document.title.includes(m))",
                arg=list(self.CHALLENGE_TITLE_MARKERS),
                timeout=self.CHALLENGE_TIMEOUT_MS,
            )
            logger.info("Cloudflare挑战已完成")
        except Exception as e:
            # Best effort: whatever interrupted the wait, retry with a reload.
            logger.warning(f"等待Cloudflare挑战超时: {e}")
            await page.reload(wait_until="domcontentloaded")
            logger.info("已刷新页面")

    async def _extract_name(self, page, url):
        """Return the product name from the page, the URL, or a placeholder."""
        for selector in self.NAME_SELECTORS:
            try:
                element = await page.query_selector(selector)
                if not element:
                    continue
                text = await element.text_content()
                candidate = text.strip() if text else ""
                # The bare host name is not a usable product name.
                if candidate and candidate != "www.producthunt.com":
                    logger.info(f"提取到产品名称: {candidate}")
                    return candidate
            except Exception as e:
                logger.debug(f"选择器 {selector} 失败: {e}")

        name = self._name_from_url(url)
        if name is not None:
            logger.info(f"从URL提取产品名称: {name}")
            return name

        logger.warning("无法提取产品名称")
        return "Unknown Product"

    async def _scrape_page(self, browser, url):
        """Open ``url`` in a fresh context and return the product record."""
        context = await browser.new_context(
            viewport={'width': 1920, 'height': 1080},
            user_agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
            extra_http_headers={
                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
                'Accept-Language': 'en-US,en;q=0.9',
                'Accept-Encoding': 'gzip, deflate, br',
                'DNT': '1',
                'Connection': 'keep-alive',
                'Upgrade-Insecure-Requests': '1',
            },
        )
        page = await context.new_page()

        # Mask the most common automation fingerprints before any page script runs.
        await page.add_init_script("""
            Object.defineProperty(navigator, 'webdriver', {
                get: () => undefined,
            });
            Object.defineProperty(navigator, 'plugins', {
                get: () => [1, 2, 3, 4, 5],
            });
            Object.defineProperty(navigator, 'languages', {
                get: () => ['en-US', 'en'],
            });
        """)

        page.set_default_timeout(self.CHALLENGE_TIMEOUT_MS)
        await page.goto(url, wait_until="domcontentloaded")

        page_title = await page.title()
        logger.info(f"页面标题: {page_title}")

        if any(marker in page_title for marker in self.CHALLENGE_TITLE_MARKERS):
            await self._wait_for_challenge(page)

        await page.wait_for_timeout(self.SETTLE_DELAY_MS)

        current_url = page.url
        logger.info(f"当前页面URL: {current_url}")
        if current_url != url:
            logger.warning(f"页面已重定向: {url} -> {current_url}")

        name = await self._extract_name(page, url)
        # Simplified version: only the name is extracted, the rest stays empty.
        return self._blank_product(url, name)

    async def scrape_with_stealth(self, url):
        """Scrape one product page with a stealth-configured headed browser.

        Args:
            url: Product page to open.

        Returns:
            A dict with the keys ``url``, ``name``, ``introduction``,
            ``user_count``, ``maker_link`` and ``maker_statement``. On any
            failure the error is logged and a record named ``'Error'`` is
            returned instead of raising.
        """
        try:
            from playwright.async_api import async_playwright

            logger.info(f"开始高级抓取: {url}")

            # The context manager stops the Playwright driver on every exit
            # path, and the finally clause does the same for the browser, so
            # a failed scrape no longer leaves a Chromium window behind.
            async with async_playwright() as playwright:
                browser = await playwright.chromium.launch(
                    headless=False,  # headed, so the challenge can be solved by hand
                    args=[
                        '--disable-blink-features=AutomationControlled',
                        '--disable-features=VizDisplayCompositor',
                        '--disable-background-timer-throttling',
                        '--disable-backgrounding-occluded-windows',
                        '--disable-renderer-backgrounding',
                        '--disable-web-security',
                        '--disable-features=TranslateUI',
                        '--disable-ipc-flooding-protection',
                        '--no-sandbox',
                        '--disable-setuid-sandbox',
                    ],
                )
                try:
                    product_info = await self._scrape_page(browser, url)
                finally:
                    await browser.close()

            logger.success(f"抓取完成: {product_info['name']}")
            return product_info

        except Exception as e:
            logger.error(f"抓取失败: {e}")
            return self._blank_product(url, 'Error')

    async def run_test(self):
        """Scrape up to three ProductHunt links from ``tophub_data.db`` and print a summary."""
        # The link database is looked up one directory above the product database.
        tophub_db_path = os.path.join(os.path.dirname(self.db_path), "..", "tophub_data.db")
        with closing(sqlite3.connect(tophub_db_path)) as conn:
            rows = conn.execute("""
                SELECT url FROM articles
                WHERE url LIKE '%producthunt.com%'
                LIMIT 3
            """).fetchall()
        urls = [row[0] for row in rows]

        logger.info(f"找到 {len(urls)} 个ProductHunt链接")

        for url in urls:
            logger.info(f"处理URL: {url}")

            # check_duplicate() is deliberately not consulted here, so every
            # link is scraped again even when it is already stored.
            product_info = await self.scrape_with_stealth(url)

            # NOTE(review): a failed scrape yields a record named 'Error',
            # which is saved like any other result and can overwrite a
            # previously stored name - confirm this is intended.
            self.save_product_info(product_info)

        with closing(sqlite3.connect(self.db_path)) as conn:
            count = conn.execute("SELECT COUNT(*) FROM products").fetchone()[0]
            products = conn.execute("SELECT name, url FROM products").fetchall()

        logger.success("测试任务完成")
        print("\n=== 测试结果统计 ===")
        print(f"数据库中的产品数量: {count}")
        print("已抓取的产品:")
        for name, product_url in products:
            print(f" - {name}: {product_url}")


async def main():
    """Configure file logging, then run the scraper test."""
    # Replace loguru's default stderr sink with a rotating log file.
    logger.remove()
    logger.add(
        "advanced_scraper.log",
        level="DEBUG",
        format="{time:YYYY-MM-DD HH:mm:ss} | {level:<8} | {name}:{function}:{line} - {message}",
        rotation="10 MB",
        retention="7 days",
    )

    scraper = AdvancedProductHuntScraper()
    await scraper.run_test()


if __name__ == "__main__":
    asyncio.run(main())