#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""ProductHunt scraper.

Queries tophub_data.db for links containing producthunt.com, scrapes each
product page with Playwright, and stores the extracted fields in product.db.
"""

import asyncio
import os
import sqlite3
import sys
from contextlib import closing
from datetime import datetime

from loguru import logger
from tqdm import tqdm

# Logging configuration: drop loguru's default sink and log to stderr at INFO.
logger.remove()
logger.add(
    sys.stderr,
    level="INFO",
    format="{time:YYYY-MM-DD HH:mm:ss} | {level: <8} | {name}:{function}:{line} - {message}",
)


class ProductHuntScraper:
    """Scrape ProductHunt product pages listed in tophub_data.db into product.db."""

    # Prefix used to turn relative maker hrefs into absolute URLs.
    _BASE_URL = "https://www.producthunt.com"

    def __init__(self):
        # tophub_data.db lives one directory above this file; product.db sits
        # next to this file.
        self.tophub_db_path = os.path.join(os.path.dirname(os.path.dirname(__file__)), "tophub_data.db")
        self.product_db_path = os.path.join(os.path.dirname(__file__), "product.db")
        self.product_urls = []

    def query_producthunt_urls(self):
        """Return every ``articles.url`` containing producthunt.com.

        Returns:
            A list of URL strings, or an empty list if the query fails.
        """
        logger.info("正在查询tophub_data.db数据库...")

        try:
            # closing() guarantees the connection is released even when the
            # query raises (sqlite3's own context manager only manages
            # transactions, it does not close the connection).
            with closing(sqlite3.connect(self.tophub_db_path)) as conn:
                cursor = conn.cursor()
                cursor.execute("SELECT url FROM articles WHERE url LIKE '%producthunt.com%'")
                urls = [row[0] for row in cursor.fetchall()]

            logger.success(f"找到 {len(urls)} 个包含producthunt.com的链接")
            return urls

        except Exception as e:
            logger.error(f"查询数据库失败: {e}")
            return []

    def init_product_database(self):
        """Create the ``products`` table in product.db if it does not exist.

        Errors are logged and swallowed; nothing is returned.
        """
        logger.info("正在初始化product.db数据库...")

        try:
            with closing(sqlite3.connect(self.product_db_path)) as conn:
                conn.execute('''
                    CREATE TABLE IF NOT EXISTS products (
                        id INTEGER PRIMARY KEY AUTOINCREMENT,
                        url TEXT NOT NULL UNIQUE,
                        name TEXT,
                        introduction TEXT,
                        user_count TEXT,
                        maker_link TEXT,
                        maker_statement TEXT,
                        created_at TEXT NOT NULL,
                        updated_at TEXT NOT NULL
                    )
                ''')
                conn.commit()

            logger.success("product.db数据库初始化完成")

        except Exception as e:
            logger.error(f"初始化数据库失败: {e}")

    def check_duplicate(self, url):
        """Return True if ``url`` already has a row in ``products``.

        Returns False when the lookup itself fails, so the caller proceeds
        to scrape the URL.
        """
        try:
            with closing(sqlite3.connect(self.product_db_path)) as conn:
                cursor = conn.cursor()
                cursor.execute("SELECT COUNT(*) FROM products WHERE url = ?", (url,))
                count = cursor.fetchone()[0]
            return count > 0

        except Exception as e:
            logger.error(f"检查重复失败: {e}")
            return False

    def save_product_info(self, product_info):
        """Insert or update the ``products`` row for ``product_info['url']``.

        Args:
            product_info: Dict with a required ``'url'`` key and optional
                ``'name'``, ``'introduction'``, ``'user_count'``,
                ``'maker_link'`` and ``'maker_statement'`` keys; missing
                optional keys are stored as NULL.

        Returns:
            True on success, False if anything went wrong (the error is logged).
        """
        try:
            with closing(sqlite3.connect(self.product_db_path)) as conn:
                cursor = conn.cursor()
                current_time = datetime.now().strftime('%Y-%m-%d %H:%M:%S')

                # Existing row -> update in place (created_at is preserved).
                cursor.execute("SELECT id FROM products WHERE url = ?", (product_info['url'],))
                existing = cursor.fetchone()

                if existing:
                    cursor.execute('''
                        UPDATE products
                        SET name = ?, introduction = ?, user_count = ?,
                            maker_link = ?, maker_statement = ?, updated_at = ?
                        WHERE url = ?
                    ''', (
                        product_info.get('name'),
                        product_info.get('introduction'),
                        product_info.get('user_count'),
                        product_info.get('maker_link'),
                        product_info.get('maker_statement'),
                        current_time,
                        product_info['url'],
                    ))
                    logger.info(f"更新产品信息: {product_info.get('name', '未知')}")
                else:
                    cursor.execute('''
                        INSERT INTO products
                        (url, name, introduction, user_count, maker_link, maker_statement, created_at, updated_at)
                        VALUES (?, ?, ?, ?, ?, ?, ?, ?)
                    ''', (
                        product_info['url'],
                        product_info.get('name'),
                        product_info.get('introduction'),
                        product_info.get('user_count'),
                        product_info.get('maker_link'),
                        product_info.get('maker_statement'),
                        current_time,
                        current_time,
                    ))
                    logger.info(f"新增产品信息: {product_info.get('name', '未知')}")

                conn.commit()
            return True

        except Exception as e:
            logger.error(f"保存产品信息失败: {e}")
            return False

    @staticmethod
    async def _first_text(page, selectors, accept=bool):
        """Return the first stripped text accepted by ``accept``, else None.

        Selectors are tried in order. ``<meta>`` elements carry their value in
        the ``content`` attribute rather than in text, so that attribute is
        read for selectors targeting ``//meta``.
        """
        for selector in selectors:
            element = await page.query_selector(selector)
            if not element:
                continue
            if "//meta" in selector:
                raw = await element.get_attribute("content")
            else:
                raw = await element.text_content()
            # text_content()/get_attribute() may return None.
            text = (raw or "").strip()
            if accept(text):
                return text
        return None

    @staticmethod
    async def _wait_for_cloudflare(page):
        """Wait (up to 5 minutes) for a Cloudflare challenge page to clear."""
        page_title = await page.title()
        if not ("请稍候" in page_title or "Checking" in page_title or "Verifying" in page_title):
            return

        logger.info("检测到Cloudflare挑战页面,等待验证完成...")
        try:
            # The challenge is considered finished once the title no longer
            # contains any of the challenge markers.
            await page.wait_for_function(
                """() => {
                    const title = document.title;
                    return !title.includes('请稍候') &&
                           !title.includes('Checking') &&
                           !title.includes('Verifying') &&
                           title !== '请稍候…';
                }""",
                timeout=300000,  # 5 minutes
            )
            logger.info("Cloudflare挑战已完成")
        except Exception as e:
            logger.warning(f"等待Cloudflare挑战超时: {e}")

    async def _extract_maker_link(self, page):
        """Return the absolute maker link found on ``page``, or None."""
        maker_link_selectors = [
            "xpath=//span[contains(@class, 'absolute')]",
            "xpath=//a[contains(@href, 'hunter')]",
            "xpath=//a[contains(text(), 'hunter')]",
            "xpath=//a[contains(@class, 'maker')]",
        ]

        for selector in maker_link_selectors:
            maker_element = await page.query_selector(selector)
            if not maker_element:
                continue

            maker_link = None
            if 'span' in selector:
                # For a <span> match the href lives on the enclosing <a>.
                # evaluate_handle() always returns a (truthy) JSHandle, even
                # for a null result, so as_element() is needed to detect
                # "no enclosing anchor".
                handle = await maker_element.evaluate_handle('(element) => element.closest("a")')
                anchor = handle.as_element()
                if anchor:
                    maker_link = await anchor.get_attribute('href')
            else:
                maker_link = await maker_element.get_attribute('href')

            if maker_link and not maker_link.startswith('http'):
                if maker_link.startswith('/'):
                    maker_link = self._BASE_URL + maker_link
                else:
                    maker_link = self._BASE_URL + '/' + maker_link

            if maker_link:
                return maker_link

        return None

    async def _extract_maker_statement(self, browser, maker_link):
        """Open ``maker_link`` in a new page and return the first comment-like text, or None."""
        statement_selectors = [
            # NOTE(review): this first selector targets one specific comment
            # id and looks like a leftover from a single page - confirm.
            "xpath=//*[@id='comment-4597755']/div/div[2]/div/div/div",
            "xpath=//div[contains(@class, 'comment')]",
            "xpath=//p[contains(@class, 'comment')]",
            "xpath=//article",
        ]

        new_page = await browser.new_page()
        try:
            await new_page.goto(maker_link, wait_until="domcontentloaded")
            await new_page.wait_for_timeout(5000)
            return await self._first_text(
                new_page, statement_selectors, accept=lambda text: len(text) > 10
            )
        finally:
            # Always release the secondary page, even if navigation fails.
            await new_page.close()

    async def _scrape_page(self, browser, url):
        """Load ``url`` in ``browser`` and return a dict of the fields that could be extracted."""
        page = await browser.new_page()
        # Generous default timeout to leave room for Cloudflare challenges.
        page.set_default_timeout(120000)

        await page.goto(url, wait_until="domcontentloaded")
        await self._wait_for_cloudflare(page)

        # Give client-side rendering a moment to settle.
        await page.wait_for_timeout(3000)

        product_info = {'url': url}

        # Each field is extracted independently so that one failure does not
        # prevent the others from being collected.

        # Product name
        try:
            name_selectors = [
                "xpath=//h1",
                "xpath=//h1[@data-test='product-name']",
                "xpath=//h1[contains(@class, 'text')]",
                "xpath=//title",
            ]
            # Reject generic site titles and very short strings.
            name = await self._first_text(
                page,
                name_selectors,
                accept=lambda text: bool(text) and 'Product Hunt' not in text and len(text) > 5,
            )
            if name:
                product_info['name'] = name
                logger.info(f"提取到产品名称: {product_info['name']}")
            else:
                logger.warning("未找到有效的产品名称元素")
        except Exception as e:
            logger.warning(f"提取产品名称失败: {e}")

        # Product introduction
        try:
            intro_selectors = [
                "xpath=//*[@class='relative text-16 font-normal text-gray-700']//div",
                "xpath=//p[contains(@class, 'description')]",
                "xpath=//div[contains(@class, 'description')]",
                "xpath=//meta[@name='description']",
            ]
            introduction = await self._first_text(page, intro_selectors)
            if introduction:
                product_info['introduction'] = introduction
                logger.info(f"提取到产品简介: {product_info['introduction'][:100]}...")
            else:
                logger.warning("未找到产品简介元素")
        except Exception as e:
            logger.warning(f"提取产品简介失败: {e}")

        # User count
        try:
            user_count_selectors = [
                "xpath=//*[@class='flex flex-row gap-2']//div/div[2]/span/p",
                "xpath=//span[contains(text(), 'users')]",
                "xpath=//span[contains(text(), 'upvotes')]",
                "xpath=//div[contains(@class, 'stats')]",
            ]
            user_count = await self._first_text(page, user_count_selectors)
            if user_count:
                product_info['user_count'] = user_count
                logger.info(f"提取到用户数: {product_info['user_count']}")
            else:
                logger.warning("未找到用户数元素")
        except Exception as e:
            logger.warning(f"提取用户数失败: {e}")

        # Maker link
        try:
            maker_link = await self._extract_maker_link(page)
            if maker_link:
                product_info['maker_link'] = maker_link
                logger.info(f"提取到制作人链接: {maker_link}")
            else:
                logger.warning("未找到制作人链接元素")
        except Exception as e:
            logger.warning(f"提取制作人链接失败: {e}")

        # Maker statement (simplified): taken from the maker link's page.
        try:
            if product_info.get('maker_link'):
                statement = await self._extract_maker_statement(browser, product_info['maker_link'])
                if statement:
                    product_info['maker_statement'] = statement
                    logger.info(f"提取到制作人发言: {product_info['maker_statement'][:100]}...")
            else:
                logger.warning("没有制作人链接,跳过提取制作人发言")
        except Exception as e:
            logger.warning(f"提取制作人发言失败: {e}")

        return product_info

    async def scrape_product_info(self, url):
        """Scrape one product page with Playwright.

        Args:
            url: The product page URL.

        Returns:
            A dict that always contains ``'url'`` plus whichever of ``'name'``,
            ``'introduction'``, ``'user_count'``, ``'maker_link'`` and
            ``'maker_statement'`` could be extracted. If the scrape fails as a
            whole, the dict contains only ``'url'``.
        """
        try:
            # Imported lazily so the module can be loaded without Playwright.
            from playwright.async_api import async_playwright

            logger.info(f"开始抓取: {url}")

            # The context manager stops the Playwright driver and the
            # try/finally closes the browser even when scraping raises, so a
            # failed URL does not leave a Chromium process behind.
            async with async_playwright() as playwright:
                browser = await playwright.chromium.launch(headless=True)
                try:
                    product_info = await self._scrape_page(browser, url)
                finally:
                    await browser.close()

            logger.success(f"抓取完成: {product_info.get('name', '未知')}")
            return product_info

        except Exception as e:
            logger.error(f"抓取产品信息失败: {e}")
            return {'url': url}

    async def process_urls(self):
        """Scrape and store every ProductHunt URL that is not already in product.db."""
        # De-duplicate while preserving order so a URL listed several times is
        # scraped at most once per run.
        self.product_urls = list(dict.fromkeys(self.query_producthunt_urls()))

        if not self.product_urls:
            logger.warning("未找到包含producthunt.com的链接")
            return

        self.init_product_database()

        logger.info(f"开始处理 {len(self.product_urls)} 个产品链接")

        with tqdm(total=len(self.product_urls), desc="处理进度") as pbar:
            for url in self.product_urls:
                try:
                    if self.check_duplicate(url):
                        logger.info(f"跳过已存在的链接: {url}")
                        continue

                    product_info = await self.scrape_product_info(url)

                    # A dict holding only 'url' means nothing was extracted.
                    # Storing it would make check_duplicate() skip this URL on
                    # every later run, so it is left unsaved for a retry.
                    if len(product_info) > 1:
                        self.save_product_info(product_info)
                    else:
                        logger.warning(f"未抓取到任何产品信息,跳过保存: {url}")

                except Exception as e:
                    logger.error(f"处理链接失败 {url}: {e}")
                finally:
                    # Advance exactly once per URL on every path.
                    pbar.update(1)

    def run(self):
        """Run the whole scraping job synchronously; errors are logged, not raised."""
        logger.info("开始ProductHunt数据抓取任务")

        try:
            asyncio.run(self.process_urls())
            logger.success("任务完成")
        except Exception as e:
            logger.error(f"程序执行失败: {e}")


def main():
    """Script entry point."""
    scraper = ProductHuntScraper()
    scraper.run()


if __name__ == "__main__":
    main()