增加抓取producthunt的数据

2025-11-23 11:15:45 +08:00
parent ee308c6d6f
commit 9088939701
15 changed files with 1855 additions and 181 deletions
--- a/product/pycache/playwright-get-data.cpython-313.pyc
+++ b/product/pycache/playwright-get-data.cpython-313.pyc
--- a/product/advanced_scraper.py
+++ b/product/advanced_scraper.py
@@ -0,0 +1,312 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+高级ProductHunt抓取器 - 处理Cloudflare Turnstile挑战
+"""
+
+import asyncio
+import sqlite3
+from loguru import logger
+import os
+from urllib.parse import urlparse
+
+class AdvancedProductHuntScraper:
+    def __init__(self, db_path="test_product.db"):
+        """Store the SQLite file path and make sure the products table exists.
+
+        Args:
+            db_path: Path of the SQLite database file handed to sqlite3.connect.
+        """
+        self.db_path = db_path
+        self.init_database()
+    
+    def init_database(self):
+        """Create the ``products`` table in ``self.db_path`` when it is missing."""
+        # ``url`` is UNIQUE, so every product page maps to at most one row.
+        schema_sql = """
+            CREATE TABLE IF NOT EXISTS products (
+                id INTEGER PRIMARY KEY AUTOINCREMENT,
+                name TEXT,
+                url TEXT UNIQUE,
+                introduction TEXT,
+                user_count INTEGER,
+                maker_link TEXT,
+                maker_statement TEXT,
+                created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
+                updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
+            )
+        """
+
+        connection = sqlite3.connect(self.db_path)
+        connection.execute(schema_sql)
+        connection.commit()
+        connection.close()
+
+        logger.info(f"数据库已初始化: {self.db_path}")
+    
+    def check_duplicate(self, url):
+        """Return True when a row with this URL already exists in ``products``."""
+        connection = sqlite3.connect(self.db_path)
+        matching_row = connection.execute(
+            "SELECT id FROM products WHERE url = ?", (url,)
+        ).fetchone()
+        connection.close()
+        return matching_row is not None
+    
+    def save_product_info(self, product_info):
+        """Insert a product row, or update the existing row with the same URL.
+
+        Args:
+            product_info: Mapping with the keys ``url``, ``name``,
+                ``introduction``, ``user_count``, ``maker_link`` and
+                ``maker_statement``.
+
+        Raises:
+            KeyError: If one of the keys listed above is missing.
+            sqlite3.Error: If a database statement fails.
+        """
+        conn = sqlite3.connect(self.db_path)
+        try:
+            cursor = conn.cursor()
+
+            # Look the URL up first so the log line can tell an update from an insert.
+            cursor.execute("SELECT id FROM products WHERE url = ?", (product_info['url'],))
+            existing = cursor.fetchone()
+
+            if existing:
+                # Refresh the stored fields and bump updated_at.
+                cursor.execute("""
+                    UPDATE products SET 
+                        name = ?, introduction = ?, user_count = ?, 
+                        maker_link = ?, maker_statement = ?, updated_at = CURRENT_TIMESTAMP
+                    WHERE url = ?
+                """, (
+                    product_info['name'], product_info['introduction'],
+                    product_info['user_count'], product_info['maker_link'],
+                    product_info['maker_statement'], product_info['url']
+                ))
+                logger.info(f"更新产品信息: {product_info['name']}")
+            else:
+                # First time this URL is seen: create the row.
+                cursor.execute("""
+                    INSERT INTO products (name, url, introduction, user_count, maker_link, maker_statement)
+                    VALUES (?, ?, ?, ?, ?, ?)
+                """, (
+                    product_info['name'], product_info['url'], product_info['introduction'],
+                    product_info['user_count'], product_info['maker_link'], product_info['maker_statement']
+                ))
+                logger.info(f"保存产品信息: {product_info['name']}")
+
+            conn.commit()
+        finally:
+            # Close even when a statement raised, so a half-finished write
+            # does not keep the connection (and its transaction) open.
+            conn.close()
+    
+    async def scrape_with_stealth(self, url):
+        """Open a product page in a stealth-configured Chromium and read its name.
+
+        A visible (non-headless) browser is launched, a few automation
+        fingerprints are masked, and when the page title looks like a
+        Cloudflare challenge the method waits up to five minutes for it to be
+        solved by hand. Only the product name is extracted; the remaining
+        fields are returned as ``None``.
+
+        Args:
+            url: Product page URL to open.
+
+        Returns:
+            A dict with the keys ``url``, ``name``, ``introduction``,
+            ``user_count``, ``maker_link`` and ``maker_statement``. When
+            anything fails, ``name`` is ``'Error'``.
+        """
+        playwright = None
+        browser = None
+        try:
+            from playwright.async_api import async_playwright
+
+            logger.info(f"开始高级抓取: {url}")
+
+            # Start the Playwright driver.
+            playwright = await async_playwright().start()
+
+            # Launch Chromium with flags intended to make automation less detectable.
+            browser = await playwright.chromium.launch(
+                headless=False,  # keep the window visible so the challenge can be solved by hand
+                args=[
+                    '--disable-blink-features=AutomationControlled',
+                    '--disable-features=VizDisplayCompositor',
+                    '--disable-background-timer-throttling',
+                    '--disable-backgrounding-occluded-windows',
+                    '--disable-renderer-backgrounding',
+                    '--disable-web-security',
+                    '--disable-features=TranslateUI',
+                    '--disable-ipc-flooding-protection',
+                    '--no-sandbox',
+                    '--disable-setuid-sandbox'
+                ]
+            )
+
+            # Browser context with a desktop viewport, user agent and headers.
+            context = await browser.new_context(
+                viewport={'width': 1920, 'height': 1080},
+                user_agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
+                extra_http_headers={
+                    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
+                    'Accept-Language': 'en-US,en;q=0.9',
+                    'Accept-Encoding': 'gzip, deflate, br',
+                    'DNT': '1',
+                    'Connection': 'keep-alive',
+                    'Upgrade-Insecure-Requests': '1',
+                }
+            )
+
+            page = await context.new_page()
+
+            # Mask navigator properties that commonly give automation away.
+            await page.add_init_script("""
+                Object.defineProperty(navigator, 'webdriver', {
+                    get: () => undefined,
+                });
+                Object.defineProperty(navigator, 'plugins', {
+                    get: () => [1, 2, 3, 4, 5],
+                });
+                Object.defineProperty(navigator, 'languages', {
+                    get: () => ['en-US', 'en'],
+                });
+            """)
+
+            # Default timeout for page operations: 5 minutes.
+            page.set_default_timeout(300000)
+
+            await page.goto(url, wait_until="domcontentloaded")
+
+            page_title = await page.title()
+            logger.info(f"页面标题: {page_title}")
+
+            # These title fragments are treated as a Cloudflare challenge page.
+            if "请稍候" in page_title or "Checking" in page_title or "Verifying" in page_title:
+                logger.info("检测到Cloudflare挑战页面，等待用户手动验证...")
+
+                try:
+                    # Block until the title no longer looks like a challenge page.
+                    await page.wait_for_function(
+                        """() => {
+                            const title = document.title;
+                            return !title.includes('请稍候') && 
+                                   !title.includes('Checking') && 
+                                   !title.includes('Verifying') &&
+                                   title !== '请稍候…';
+                        }""",
+                        timeout=300000  # 5 minutes
+                    )
+                    logger.info("Cloudflare挑战已完成")
+                except Exception as e:
+                    logger.warning(f"等待Cloudflare挑战超时: {e}")
+
+                    # The wait failed: reload once and carry on with whatever loads.
+                    await page.reload(wait_until="domcontentloaded")
+                    logger.info("已刷新页面")
+
+            # Give the page a few seconds to settle.
+            await page.wait_for_timeout(5000)
+
+            current_url = page.url
+            logger.info(f"当前页面URL: {current_url}")
+
+            if current_url != url:
+                logger.warning(f"页面已重定向: {url} -> {current_url}")
+
+            # The record is keyed by the requested URL, not the redirected one.
+            product_info = {'url': url}
+
+            # Candidate selectors for the product name, tried in order.
+            name_selectors = [
+                "h1",
+                "[data-test='product-name']",
+                ".product-name",
+                "title"
+            ]
+
+            for selector in name_selectors:
+                try:
+                    element = await page.query_selector(selector)
+                    if element:
+                        name = await element.text_content()
+                        # A bare host name is not accepted as a product name.
+                        if name and name.strip() and name.strip() != "www.producthunt.com":
+                            product_info['name'] = name.strip()
+                            logger.info(f"提取到产品名称: {product_info['name']}")
+                            break
+                except Exception as e:
+                    logger.debug(f"选择器 {selector} 失败: {e}")
+
+            if 'name' not in product_info:
+                # No selector matched: derive a name from a .../products/<slug> URL.
+                parsed_url = urlparse(url)
+                path_parts = parsed_url.path.split('/')
+                if len(path_parts) >= 3 and path_parts[-2] == 'products':
+                    product_info['name'] = path_parts[-1].replace('-', ' ').title()
+                    logger.info(f"从URL提取产品名称: {product_info['name']}")
+                else:
+                    product_info['name'] = "Unknown Product"
+                    logger.warning("无法提取产品名称")
+
+            # The remaining fields are not extracted in this simplified version.
+            product_info['introduction'] = None
+            product_info['user_count'] = None
+            product_info['maker_link'] = None
+            product_info['maker_statement'] = None
+
+            logger.success(f"抓取完成: {product_info['name']}")
+            return product_info
+
+        except Exception as e:
+            logger.error(f"抓取失败: {e}")
+            return {'url': url, 'name': 'Error', 'introduction': None, 'user_count': None, 'maker_link': None, 'maker_statement': None}
+        finally:
+            # Release the browser and the Playwright driver on every path, so a
+            # failed scrape does not leave a Chromium window behind. Cleanup
+            # errors are logged instead of replacing the result.
+            if browser is not None:
+                try:
+                    await browser.close()
+                except Exception as e:
+                    logger.warning(f"关闭浏览器失败: {e}")
+            if playwright is not None:
+                try:
+                    await playwright.stop()
+                except Exception as e:
+                    logger.warning(f"停止Playwright失败: {e}")
+    
+    async def run_test(self):
+        """Scrape up to three ProductHunt links from tophub_data.db and print a summary."""
+        # tophub_data.db is looked up one directory above the product database.
+        source_db = os.path.join(os.path.dirname(self.db_path), "..", "tophub_data.db")
+
+        source_conn = sqlite3.connect(source_db)
+        rows = source_conn.execute("""
+            SELECT url FROM articles 
+            WHERE url LIKE '%producthunt.com%' 
+            LIMIT 3
+        """).fetchall()
+        urls = [row[0] for row in rows]
+        source_conn.close()
+
+        logger.info(f"找到 {len(urls)} 个ProductHunt链接")
+
+        for url in urls:
+            logger.info(f"处理URL: {url}")
+
+            # No duplicate check here: every URL is scraped again and the
+            # stored row is overwritten with the new result.
+            scraped = await self.scrape_with_stealth(url)
+            self.save_product_info(scraped)
+
+        # Collect the totals for the summary below.
+        stats_conn = sqlite3.connect(self.db_path)
+        count = stats_conn.execute("SELECT COUNT(*) FROM products").fetchone()[0]
+        products = stats_conn.execute("SELECT name, url FROM products").fetchall()
+        stats_conn.close()
+
+        logger.success("测试任务完成")
+
+        print("\n=== 测试结果统计 ===")
+        print(f"数据库中的产品数量: {count}")
+        print("已抓取的产品:")
+        for name, url in products:
+            print(f"  - {name}: {url}")
+
+async def main():
+    """Route loguru output to a rotating log file, then run the scraper test."""
+    log_options = {
+        "level": "DEBUG",
+        "format": "{time:YYYY-MM-DD HH:mm:ss} | {level:<8} | {name}:{function}:{line} - {message}",
+        "rotation": "10 MB",
+        "retention": "7 days",
+    }
+
+    # Remove the handlers configured so far so output only goes to the file.
+    logger.remove()
+    logger.add("advanced_scraper.log", **log_options)
+
+    await AdvancedProductHuntScraper().run_test()
+
+# Run the scraper only when this file is executed as a script.
+if __name__ == "__main__":
+    asyncio.run(main())
--- a/product/api_scraper.py
+++ b/product/api_scraper.py
@@ -0,0 +1,245 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+ProductHunt API抓取器 - 通过API获取产品信息
+"""
+
+import asyncio
+import sqlite3
+import requests
+from loguru import logger
+import os
+import json
+from urllib.parse import urlparse
+
+class ProductHuntAPIScraper:
+    def __init__(self, db_path="test_product.db"):
+        """Store the SQLite file path and make sure the products table exists.
+
+        Args:
+            db_path: Path of the SQLite database file handed to sqlite3.connect.
+        """
+        self.db_path = db_path
+        self.init_database()
+    
+    def init_database(self):
+        """Create the ``products`` table in ``self.db_path`` when it is missing."""
+        # ``url`` is UNIQUE, so every product page maps to at most one row.
+        schema_sql = """
+            CREATE TABLE IF NOT EXISTS products (
+                id INTEGER PRIMARY KEY AUTOINCREMENT,
+                name TEXT,
+                url TEXT UNIQUE,
+                introduction TEXT,
+                user_count INTEGER,
+                maker_link TEXT,
+                maker_statement TEXT,
+                created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
+                updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
+            )
+        """
+
+        connection = sqlite3.connect(self.db_path)
+        connection.execute(schema_sql)
+        connection.commit()
+        connection.close()
+
+        logger.info(f"数据库已初始化: {self.db_path}")
+    
+    def save_product_info(self, product_info):
+        """Insert a product row, or update the existing row with the same URL.
+
+        Args:
+            product_info: Mapping with the keys ``url``, ``name``,
+                ``introduction``, ``user_count``, ``maker_link`` and
+                ``maker_statement``.
+
+        Raises:
+            KeyError: If one of the keys listed above is missing.
+            sqlite3.Error: If a database statement fails.
+        """
+        conn = sqlite3.connect(self.db_path)
+        try:
+            cursor = conn.cursor()
+
+            # Look the URL up first so the log line can tell an update from an insert.
+            cursor.execute("SELECT id FROM products WHERE url = ?", (product_info['url'],))
+            existing = cursor.fetchone()
+
+            if existing:
+                # Refresh the stored fields and bump updated_at.
+                cursor.execute("""
+                    UPDATE products SET 
+                        name = ?, introduction = ?, user_count = ?, 
+                        maker_link = ?, maker_statement = ?, updated_at = CURRENT_TIMESTAMP
+                    WHERE url = ?
+                """, (
+                    product_info['name'], product_info['introduction'],
+                    product_info['user_count'], product_info['maker_link'],
+                    product_info['maker_statement'], product_info['url']
+                ))
+                logger.info(f"更新产品信息: {product_info['name']}")
+            else:
+                # First time this URL is seen: create the row.
+                cursor.execute("""
+                    INSERT INTO products (name, url, introduction, user_count, maker_link, maker_statement)
+                    VALUES (?, ?, ?, ?, ?, ?)
+                """, (
+                    product_info['name'], product_info['url'], product_info['introduction'],
+                    product_info['user_count'], product_info['maker_link'], product_info['maker_statement']
+                ))
+                logger.info(f"保存产品信息: {product_info['name']}")
+
+            conn.commit()
+        finally:
+            # Close even when a statement raised, so a half-finished write
+            # does not keep the connection (and its transaction) open.
+            conn.close()
+    
+    def extract_product_name_from_url(self, url):
+        """Derive a readable product name from the slug in a ProductHunt URL.
+
+        The segment following the first ``products`` path segment is used;
+        when there is none, the last path segment is used. Empty segments
+        (for example from a trailing slash) are ignored, so ``/posts/my-app/``
+        yields ``"My App"`` and ``/products/`` no longer yields an empty name.
+
+        Args:
+            url: The product page URL.
+
+        Returns:
+            The slug with hyphens turned into spaces and title-cased, or
+            ``"Unknown Product"`` when the path has no usable segment or the
+            URL cannot be parsed.
+        """
+        try:
+            # Drop empty segments produced by leading, trailing or doubled slashes.
+            segments = [part for part in urlparse(url).path.split('/') if part]
+
+            if 'products' in segments[:-1]:
+                # A real segment follows the first 'products' marker.
+                slug = segments[segments.index('products') + 1]
+            elif segments:
+                # No product marker: fall back to the last path segment.
+                slug = segments[-1]
+            else:
+                return "Unknown Product"
+
+            return slug.replace('-', ' ').title()
+        except Exception as e:
+            logger.error(f"从URL提取产品名称失败: {e}")
+            return "Unknown Product"
+    
+    def get_product_info_from_api(self, url):
+        """Build a product record from the slug that follows ``products`` in the URL.
+
+        No HTTP request is made here; only ``url`` itself is inspected.
+
+        Returns:
+            The product dict, or ``None`` when no slug is found or an error occurs.
+        """
+        try:
+            segments = urlparse(url).path.split('/')
+
+            # First segment that directly follows a 'products' segment, if any.
+            product_slug = next(
+                (
+                    segments[pos + 1]
+                    for pos, segment in enumerate(segments)
+                    if segment == 'products' and pos + 1 < len(segments)
+                ),
+                None,
+            )
+
+            if not product_slug:
+                logger.warning(f"无法从URL中提取产品slug: {url}")
+                return None
+
+            # The three trailing fields stay None; the original notes that
+            # they would need real API access.
+            product_info = dict(
+                url=url,
+                name=self.extract_product_name_from_url(url),
+                introduction=f"Product from ProductHunt: {product_slug}",
+                user_count=None,
+                maker_link=None,
+                maker_statement=None,
+            )
+
+            logger.info(f"通过API获取产品信息: {product_info['name']}")
+            return product_info
+
+        except Exception as e:
+            logger.error(f"API获取产品信息失败: {e}")
+            return None
+    
+    def get_product_info_fallback(self, url):
+        """Fallback: build a minimal product record from the URL alone.
+
+        Returns:
+            The product dict, or ``None`` when an error occurs.
+        """
+        try:
+            derived_name = self.extract_product_name_from_url(url)
+
+            product_info = dict(
+                url=url,
+                name=derived_name,
+                introduction=f"Product from ProductHunt: {derived_name}",
+                user_count=None,
+                maker_link=None,
+                maker_statement=None,
+            )
+
+            logger.info(f"使用备用方法获取产品信息: {product_info['name']}")
+            return product_info
+
+        except Exception as e:
+            logger.error(f"备用方法获取产品信息失败: {e}")
+            return None
+    
+    def run_test(self):
+        """Process up to three ProductHunt links from tophub_data.db and print a summary."""
+        # tophub_data.db is looked up one directory above the product database.
+        source_db = os.path.join(os.path.dirname(self.db_path), "..", "tophub_data.db")
+
+        source_conn = sqlite3.connect(source_db)
+        rows = source_conn.execute("""
+            SELECT url FROM articles 
+            WHERE url LIKE '%producthunt.com%' 
+            LIMIT 3
+        """).fetchall()
+        urls = [row[0] for row in rows]
+        source_conn.close()
+
+        logger.info(f"找到 {len(urls)} 个ProductHunt链接")
+
+        for url in urls:
+            logger.info(f"处理URL: {url}")
+
+            # Try the primary lookup, then the fallback, then a fixed placeholder.
+            product_info = (
+                self.get_product_info_from_api(url)
+                or self.get_product_info_fallback(url)
+                or {
+                    'url': url,
+                    'name': 'Unknown Product',
+                    'introduction': 'Unable to fetch product information',
+                    'user_count': None,
+                    'maker_link': None,
+                    'maker_statement': None
+                }
+            )
+
+            self.save_product_info(product_info)
+
+        # Collect the totals for the summary below.
+        stats_conn = sqlite3.connect(self.db_path)
+        count = stats_conn.execute("SELECT COUNT(*) FROM products").fetchone()[0]
+        products = stats_conn.execute("SELECT name, url FROM products").fetchall()
+        stats_conn.close()
+
+        logger.success("测试任务完成")
+
+        print("\n=== 测试结果统计 ===")
+        print(f"数据库中的产品数量: {count}")
+        print("已抓取的产品:")
+        for name, url in products:
+            print(f"  - {name}: {url}")
+
+def main():
+    """Route loguru output to a rotating log file, then run the scraper test."""
+    log_options = {
+        "level": "DEBUG",
+        "format": "{time:YYYY-MM-DD HH:mm:ss} | {level:<8} | {name}:{function}:{line} - {message}",
+        "rotation": "10 MB",
+        "retention": "7 days",
+    }
+
+    # Remove the handlers configured so far so output only goes to the file.
+    logger.remove()
+    logger.add("api_scraper.log", **log_options)
+
+    ProductHuntAPIScraper().run_test()
+
+# Run the scraper only when this file is executed as a script.
+if __name__ == "__main__":
+    main()
--- a/product/integrated_scraper.py
+++ b/product/integrated_scraper.py
@@ -0,0 +1,356 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+全功能ProductHunt数据抓取器
+使用playwright-get-data.py中的专业功能绕过Cloudflare挑战
+"""
+
+import sqlite3
+import asyncio
+import os
+import argparse
+from datetime import datetime
+from loguru import logger
+from tqdm import tqdm
+import sys
+
+# 导入playwright-get-data.py中的功能
+import importlib.util
+
+# Load playwright-get-data.py by file path: its hyphenated file name is not a
+# valid module name for a regular ``import`` statement. The module body is
+# executed here, at import time of this file.
+playwright_data_path = os.path.join(os.path.dirname(__file__), "playwright-get-data.py")
+spec = importlib.util.spec_from_file_location("playwright_get_data", playwright_data_path)
+playwright_get_data = importlib.util.module_from_spec(spec)
+spec.loader.exec_module(playwright_get_data)
+ProductHuntScraper = playwright_get_data.ProductHuntScraper
+
+# Logging: remove the handlers configured so far, then log INFO and above to stderr.
+logger.remove()
+logger.add(sys.stderr, level="INFO", format="<green>{time:YYYY-MM-DD HH:mm:ss}</green> | <level>{level: <8}</level> | <cyan>{name}</cyan>:<cyan>{function}</cyan>:<cyan>{line}</cyan> - <level>{message}</level>")
+
+class ProductHuntScraperFull:
+    """全功能ProductHunt数据抓取器"""
+    
+    def __init__(self, tophub_db_path=None, product_db_path=None, debug_port=9222, limit=10, skip_duplicates=True):
+        """
+        Set up paths and options for a scraping run.
+
+        Args:
+            tophub_db_path: Path of the tophub database; when falsy,
+                ``tophub_data.db`` in the parent of this file's directory is used.
+            product_db_path: Path of the product database; when falsy,
+                ``products.db`` next to this file is used.
+            debug_port: Chrome debugging port.
+            limit: Maximum number of links to fetch.
+            skip_duplicates: Whether URLs already stored are skipped.
+        """
+        module_dir = os.path.dirname(__file__)
+
+        self.tophub_db_path = tophub_db_path or os.path.join(os.path.dirname(module_dir), "tophub_data.db")
+        self.product_db_path = product_db_path or os.path.join(module_dir, "products.db")
+
+        self.debug_port = debug_port
+        self.limit = limit
+        self.skip_duplicates = skip_duplicates
+        self.product_urls = []
+        
+    def query_producthunt_urls(self, limit=None):
+        """Return article URLs containing ``producthunt.com`` from the tophub database.
+
+        Args:
+            limit: Maximum number of rows; ``None`` means ``self.limit`` and a
+                value that is not positive means no limit.
+
+        Returns:
+            A list of URLs; an empty list when the query fails.
+        """
+        effective_limit = self.limit if limit is None else limit
+
+        logger.info(f"正在查询tophub_data.db数据库，限制: {effective_limit}条")
+
+        try:
+            connection = sqlite3.connect(self.tophub_db_path)
+
+            if effective_limit > 0:
+                result = connection.execute(
+                    "SELECT url FROM articles WHERE url LIKE '%producthunt.com%' LIMIT ?",
+                    (effective_limit,),
+                )
+            else:
+                result = connection.execute(
+                    "SELECT url FROM articles WHERE url LIKE '%producthunt.com%'"
+                )
+
+            urls = [record[0] for record in result.fetchall()]
+            connection.close()
+
+            logger.success(f"找到 {len(urls)} 个包含producthunt.com的链接")
+            return urls
+
+        except Exception as e:
+            logger.error(f"查询数据库失败: {e}")
+            return []
+    
+    def init_product_database(self):
+        """Create the ``products`` table in the product database when it is missing.
+
+        Failures are logged and not raised.
+        """
+        logger.info("正在初始化产品数据库...")
+
+        # Timestamps are stored as TEXT and supplied by the caller on insert/update.
+        schema_sql = '''
+                CREATE TABLE IF NOT EXISTS products (
+                    id INTEGER PRIMARY KEY AUTOINCREMENT,
+                    url TEXT NOT NULL UNIQUE,
+                    name TEXT,
+                    introduction TEXT,
+                    user_count TEXT,
+                    maker_link TEXT,
+                    maker_statement TEXT,
+                    created_at TEXT NOT NULL,
+                    updated_at TEXT NOT NULL
+                )
+            '''
+
+        try:
+            connection = sqlite3.connect(self.product_db_path)
+            connection.execute(schema_sql)
+            connection.commit()
+            connection.close()
+            logger.success("产品数据库初始化完成")
+
+        except Exception as e:
+            logger.error(f"初始化数据库失败: {e}")
+    
+    def check_duplicate(self, url):
+        """Return True when the URL is already stored; False otherwise or on error."""
+        try:
+            connection = sqlite3.connect(self.product_db_path)
+            matches = connection.execute(
+                "SELECT COUNT(*) FROM products WHERE url = ?", (url,)
+            ).fetchone()[0]
+            connection.close()
+
+            return matches > 0
+
+        except Exception as e:
+            logger.error(f"检查重复失败: {e}")
+            return False
+    
+    def save_product_info(self, product_info):
+        """Insert a product row, or update the existing row with the same URL.
+
+        Args:
+            product_info: Mapping that must contain ``url``; ``name``,
+                ``introduction``, ``user_count``, ``maker_link`` and
+                ``maker_statement`` are optional and stored as NULL when absent.
+
+        Returns:
+            True when the row was written, False when any error occurred
+            (the error is logged, not raised).
+        """
+        try:
+            conn = sqlite3.connect(self.product_db_path)
+            try:
+                cursor = conn.cursor()
+
+                # Local time, used for created_at and/or updated_at.
+                current_time = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
+
+                # Look the URL up first so the log line can tell an update from an insert.
+                cursor.execute("SELECT id FROM products WHERE url = ?", (product_info['url'],))
+                existing = cursor.fetchone()
+
+                if existing:
+                    # Refresh the stored fields and bump updated_at.
+                    cursor.execute('''
+                        UPDATE products SET 
+                        name = ?, introduction = ?, user_count = ?, 
+                        maker_link = ?, maker_statement = ?, updated_at = ?
+                        WHERE url = ?
+                    ''', (
+                        product_info.get('name'),
+                        product_info.get('introduction'),
+                        product_info.get('user_count'),
+                        product_info.get('maker_link'),
+                        product_info.get('maker_statement'),
+                        current_time,
+                        product_info['url']
+                    ))
+                    logger.info(f"更新产品信息: {product_info.get('name', '未知')}")
+                else:
+                    # First time this URL is seen: create the row.
+                    cursor.execute('''
+                        INSERT INTO products 
+                        (url, name, introduction, user_count, maker_link, maker_statement, created_at, updated_at)
+                        VALUES (?, ?, ?, ?, ?, ?, ?, ?)
+                    ''', (
+                        product_info['url'],
+                        product_info.get('name'),
+                        product_info.get('introduction'),
+                        product_info.get('user_count'),
+                        product_info.get('maker_link'),
+                        product_info.get('maker_statement'),
+                        current_time,
+                        current_time
+                    ))
+                    logger.info(f"新增产品信息: {product_info.get('name', '未知')}")
+
+                conn.commit()
+            finally:
+                # The error below is swallowed and the run continues, so the
+                # connection must be released here even when a statement fails.
+                conn.close()
+            return True
+
+        except Exception as e:
+            logger.error(f"保存产品信息失败: {e}")
+            return False
+    
+    async def scrape_product_info(self, url):
+        """Scrape one product page through the ProductHuntScraper helper.
+
+        Connects to an already running Chrome on ``self.debug_port``,
+        navigates to ``url`` and extracts the product fields.
+
+        Args:
+            url: Product page URL.
+
+        Returns:
+            The extracted dict with ``url`` added, the falsy value returned
+            by the extractor when extraction yields nothing, or ``None``
+            when connecting, navigating or any other step fails.
+        """
+        try:
+            logger.info(f"开始抓取: {url}")
+
+            scraper = ProductHuntScraper(debug_port=self.debug_port)
+
+            # Attach to the Chrome instance that is already running.
+            connected = await scraper.connect_to_existing_chrome()
+            if not connected:
+                logger.error("连接Chrome失败，跳过此URL")
+                return None
+
+            try:
+                navigated = await scraper.navigate_to_producthunt(url)
+                if not navigated:
+                    logger.error("导航到页面失败，跳过此URL")
+                    return None
+
+                product_info = await scraper.extract_product_info()
+                if product_info:
+                    # Key the record by the requested URL.
+                    product_info['url'] = url
+                    logger.success(f"成功提取产品信息: {product_info.get('name', '未知')}")
+                else:
+                    logger.error("提取产品信息失败")
+
+                return product_info
+            finally:
+                # Once connected, always close the scraper, including when
+                # navigation or extraction raises.
+                await scraper.close()
+
+        except Exception as e:
+            logger.error(f"抓取产品信息失败: {e}")
+            return None
+    
+    async def run_scraping(self, urls=None):
+        """Scrape every target URL, store the results and log a summary.
+
+        Args:
+            urls: URLs to scrape; when ``None`` they are read from the tophub
+                database via ``query_producthunt_urls``.
+
+        Returns:
+            False when there is nothing to scrape, True otherwise.
+        """
+        logger.info("=== 开始ProductHunt数据抓取 ===")
+
+        self.init_product_database()
+
+        self.product_urls = self.query_producthunt_urls() if urls is None else urls
+
+        if not self.product_urls:
+            logger.error("未找到要抓取的ProductHunt链接")
+            return False
+
+        logger.info(f"找到 {len(self.product_urls)} 个ProductHunt链接")
+
+        success_count = skip_count = error_count = 0
+
+        with tqdm(total=len(self.product_urls), desc="抓取ProductHunt链接") as progress:
+            for url in self.product_urls:
+                logger.info(f"处理URL: {url}")
+
+                # Already stored and skipping enabled: count it and move on.
+                if self.skip_duplicates and self.check_duplicate(url):
+                    logger.info(f"URL已存在，跳过: {url}")
+                    skip_count += 1
+                    progress.update(1)
+                    continue
+
+                product_info = await self.scrape_product_info(url)
+
+                if not product_info:
+                    logger.error(f"抓取产品信息失败: {url}")
+                    error_count += 1
+                elif self.save_product_info(product_info):
+                    logger.success(f"成功保存产品信息: {product_info.get('name', '未知')}")
+                    success_count += 1
+                else:
+                    logger.error(f"保存产品信息失败: {url}")
+                    error_count += 1
+
+                progress.update(1)
+
+        self.show_scraping_results(success_count, skip_count, error_count)
+
+        logger.success("=== ProductHunt数据抓取完成 ===")
+        return True
+    
+    def show_scraping_results(self, success_count, skip_count, error_count):
+        """Log the run counters, the table size and the ten most recently updated products.
+
+        Failures are logged and not raised.
+        """
+        try:
+            connection = sqlite3.connect(self.product_db_path)
+
+            total_count = connection.execute("SELECT COUNT(*) FROM products").fetchone()[0]
+            recent_products = connection.execute(
+                "SELECT name, url FROM products ORDER BY updated_at DESC LIMIT 10"
+            ).fetchall()
+
+            connection.close()
+
+            summary_lines = (
+                "=== 抓取结果统计 ===",
+                f"成功抓取: {success_count} 个产品",
+                f"跳过重复: {skip_count} 个链接",
+                f"抓取失败: {error_count} 个链接",
+                f"数据库中的产品总数: {total_count}",
+            )
+            for line in summary_lines:
+                logger.info(line)
+
+            if not recent_products:
+                logger.info("数据库中暂无产品记录")
+            else:
+                logger.info("最新抓取的产品:")
+                for name, url in recent_products:
+                    logger.info(f"  - {name}: {url}")
+
+        except Exception as e:
+            logger.error(f"显示抓取结果失败: {e}")
+
+def parse_arguments():
+    """Parse the command-line options of the scraper and return the namespace."""
+    parser = argparse.ArgumentParser(description="全功能ProductHunt数据抓取器")
+
+    # (flag, keyword arguments for add_argument), registered in this order.
+    option_specs = (
+        ("--tophub-db", dict(help="tophub数据库路径", default=None)),
+        ("--product-db", dict(help="产品数据库路径", default=None)),
+        ("--debug-port", dict(type=int, help="Chrome调试端口", default=9222)),
+        ("--limit", dict(type=int, help="抓取链接数量限制", default=10)),
+        ("--no-skip-duplicates", dict(action="store_true", help="不跳过重复URL")),
+        ("--urls", dict(nargs="+", help="指定要抓取的URL列表")),
+        ("--log-file", dict(help="日志文件路径", default="producthunt_scraper.log")),
+    )
+    for flag, options in option_specs:
+        parser.add_argument(flag, **options)
+
+    return parser.parse_args()
+
+async def main():
+    """Parse the command line, add the log-file sink and run the scraper."""
+    args = parse_arguments()
+
+    # Also write INFO and above to the chosen log file.
+    logger.add(args.log_file, level="INFO", rotation="10 MB")
+
+    options = dict(
+        tophub_db_path=args.tophub_db,
+        product_db_path=args.product_db,
+        debug_port=args.debug_port,
+        limit=args.limit,
+        skip_duplicates=not args.no_skip_duplicates,
+    )
+    scraper = ProductHuntScraperFull(**options)
+
+    # Explicit --urls win; otherwise the URLs come from the tophub database.
+    if not args.urls:
+        await scraper.run_scraping()
+    else:
+        await scraper.run_scraping(urls=args.urls)
+
+if __name__ == "__main__":
+    # Run the async entry point.
+    asyncio.run(main())
--- a/product/product.db
+++ b/product/product.db
--- a/product/producthunt_scraper.py
+++ b/product/producthunt_scraper.py
@@ -0,0 +1,407 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+ProductHunt数据抓取器
+从tophub_data.db查询包含producthunt.com的链接，然后使用Playwright抓取产品信息并保存到product.db
+"""
+
+import sqlite3
+import asyncio
+import os
+from datetime import datetime
+from loguru import logger
+from tqdm import tqdm
+import sys
+
+# Logging setup: remove the handlers loguru currently has registered, then log
+# INFO and above to stderr using a colorized "time | level | name:function:line - message" format.
+logger.remove()
+logger.add(sys.stderr, level="INFO", format="<green>{time:YYYY-MM-DD HH:mm:ss}</green> | <level>{level: <8}</level> | <cyan>{name}</cyan>:<cyan>{function}</cyan>:<cyan>{line}</cyan> - <level>{message}</level>")
+
+class ProductHuntScraper:
+    """Scrape ProductHunt product pages and store the results in SQLite.
+
+    Workflow (see ``process_urls``): read every ``articles.url`` containing
+    ``producthunt.com`` from ``tophub_data.db``, scrape each page with a
+    headless Playwright Chromium browser, and insert or update one row per
+    URL in the ``products`` table of ``product.db``.
+
+    Attributes:
+        tophub_db_path: ``tophub_data.db`` in the parent of this file's directory.
+        product_db_path: ``product.db`` in this file's directory.
+        product_urls: URLs returned by the most recent query in ``process_urls``.
+    """
+
+    # Columns (besides ``url``) that a scrape may fill in, in the column order
+    # used by the INSERT/UPDATE statements of save_product_info().
+    _DATA_FIELDS = ('name', 'introduction', 'user_count', 'maker_link', 'maker_statement')
+
+    def __init__(self):
+        self.tophub_db_path = os.path.join(os.path.dirname(os.path.dirname(__file__)), "tophub_data.db")
+        self.product_db_path = os.path.join(os.path.dirname(__file__), "product.db")
+        self.product_urls = []
+
+    def query_producthunt_urls(self):
+        """Return all article URLs in tophub_data.db that contain producthunt.com.
+
+        Returns:
+            list[str]: the matching URLs, or an empty list when the query
+            fails (the error is logged, not raised).
+        """
+        logger.info("正在查询tophub_data.db数据库...")
+
+        conn = None
+        try:
+            conn = sqlite3.connect(self.tophub_db_path)
+            cursor = conn.cursor()
+            cursor.execute("SELECT url FROM articles WHERE url LIKE '%producthunt.com%'")
+            urls = [row[0] for row in cursor.fetchall()]
+
+            logger.success(f"找到 {len(urls)} 个包含producthunt.com的链接")
+            return urls
+
+        except Exception as e:
+            logger.error(f"查询数据库失败: {e}")
+            return []
+        finally:
+            # Close on the error path too, so a failed query does not leak
+            # the connection.
+            if conn is not None:
+                conn.close()
+
+    def init_product_database(self):
+        """Create the ``products`` table in product.db if it does not exist.
+
+        Failures are logged and swallowed (best effort); nothing is returned.
+        """
+        logger.info("正在初始化product.db数据库...")
+
+        conn = None
+        try:
+            conn = sqlite3.connect(self.product_db_path)
+            cursor = conn.cursor()
+
+            # One row per product URL; timestamps are stored as text.
+            cursor.execute('''
+                CREATE TABLE IF NOT EXISTS products (
+                    id INTEGER PRIMARY KEY AUTOINCREMENT,
+                    url TEXT NOT NULL UNIQUE,
+                    name TEXT,
+                    introduction TEXT,
+                    user_count TEXT,
+                    maker_link TEXT,
+                    maker_statement TEXT,
+                    created_at TEXT NOT NULL,
+                    updated_at TEXT NOT NULL
+                )
+            ''')
+
+            conn.commit()
+            logger.success("product.db数据库初始化完成")
+
+        except Exception as e:
+            logger.error(f"初始化数据库失败: {e}")
+        finally:
+            if conn is not None:
+                conn.close()
+
+    def check_duplicate(self, url):
+        """Tell whether *url* already has a row in the ``products`` table.
+
+        Args:
+            url: product page URL to look up.
+
+        Returns:
+            bool: True when a row exists; False when none exists or when the
+            lookup fails (the error is logged).
+        """
+        conn = None
+        try:
+            conn = sqlite3.connect(self.product_db_path)
+            cursor = conn.cursor()
+
+            cursor.execute("SELECT COUNT(*) FROM products WHERE url = ?", (url,))
+            count = cursor.fetchone()[0]
+            return count > 0
+
+        except Exception as e:
+            logger.error(f"检查重复失败: {e}")
+            return False
+        finally:
+            if conn is not None:
+                conn.close()
+
+    def save_product_info(self, product_info):
+        """Insert or update the ``products`` row for ``product_info['url']``.
+
+        Args:
+            product_info: dict with a mandatory ``'url'`` key and optional
+                ``name``, ``introduction``, ``user_count``, ``maker_link``
+                and ``maker_statement`` keys; missing keys are stored as NULL.
+
+        Returns:
+            bool: True on success, False when anything fails (including a
+            missing ``'url'`` key); the error is logged.
+        """
+        conn = None
+        try:
+            conn = sqlite3.connect(self.product_db_path)
+            cursor = conn.cursor()
+
+            current_time = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
+            url = product_info['url']
+            field_values = tuple(product_info.get(field) for field in self._DATA_FIELDS)
+
+            cursor.execute("SELECT id FROM products WHERE url = ?", (url,))
+            existing = cursor.fetchone()
+
+            if existing:
+                # Existing row: refresh the data columns and updated_at only.
+                cursor.execute('''
+                    UPDATE products SET 
+                    name = ?, introduction = ?, user_count = ?, 
+                    maker_link = ?, maker_statement = ?, updated_at = ?
+                    WHERE url = ?
+                ''', field_values + (current_time, url))
+                logger.info(f"更新产品信息: {product_info.get('name', '未知')}")
+            else:
+                # New row: created_at and updated_at start out identical.
+                cursor.execute('''
+                    INSERT INTO products 
+                    (url, name, introduction, user_count, maker_link, maker_statement, created_at, updated_at)
+                    VALUES (?, ?, ?, ?, ?, ?, ?, ?)
+                ''', (url,) + field_values + (current_time, current_time))
+                logger.info(f"新增产品信息: {product_info.get('name', '未知')}")
+
+            conn.commit()
+            return True
+
+        except Exception as e:
+            logger.error(f"保存产品信息失败: {e}")
+            return False
+        finally:
+            # Closing without commit discards any uncommitted change.
+            if conn is not None:
+                conn.close()
+
+    @staticmethod
+    async def _first_text(page, selectors, is_valid=bool):
+        """Return the first text among *selectors* accepted by *is_valid*, else None.
+
+        For each selector the first matching element's stripped text content
+        is used; when that is empty, the element's ``content`` attribute is
+        tried instead, because ``<meta>`` elements carry their value there.
+        """
+        for selector in selectors:
+            element = await page.query_selector(selector)
+            if element is None:
+                continue
+            # text_content() may return None; treat that like empty text so
+            # the remaining fallback selectors are still tried.
+            text = ((await element.text_content()) or "").strip()
+            if not text:
+                text = ((await element.get_attribute("content")) or "").strip()
+            if is_valid(text):
+                return text
+        return None
+
+    @staticmethod
+    async def _wait_for_cloudflare(page):
+        """Wait (up to 5 minutes) for a detected Cloudflare challenge page to clear.
+
+        Detection is based on the page title; a timeout is logged and ignored.
+        """
+        page_title = await page.title()
+        if not any(marker in page_title for marker in ("请稍候", "Checking", "Verifying")):
+            return
+
+        logger.info("检测到Cloudflare挑战页面，等待验证完成...")
+        try:
+            # Resolve once the title no longer looks like the challenge page.
+            await page.wait_for_function(
+                """() => {
+                    const title = document.title;
+                    return !title.includes('请稍候') && 
+                           !title.includes('Checking') && 
+                           !title.includes('Verifying') &&
+                           title !== '请稍候…';
+                }""",
+                timeout=300000  # 5 minutes
+            )
+            logger.info("Cloudflare挑战已完成")
+        except Exception as e:
+            logger.warning(f"等待Cloudflare挑战超时: {e}")
+
+    @staticmethod
+    async def _extract_maker_link(page):
+        """Return the absolute maker/hunter link found on *page*, or None."""
+        maker_link_selectors = [
+            "xpath=//span[contains(@class, 'absolute')]",
+            "xpath=//a[contains(@href, 'hunter')]",
+            "xpath=//a[contains(text(), 'hunter')]",
+            "xpath=//a[contains(@class, 'maker')]"
+        ]
+        base_url = "https://www.producthunt.com"
+
+        for selector in maker_link_selectors:
+            element = await page.query_selector(selector)
+            if element is None:
+                continue
+
+            # Reset per selector so a value can never leak from an earlier
+            # iteration or be read before assignment.
+            maker_link = None
+            if 'span' in selector:
+                # The span sits inside the link; walk up to the enclosing <a>.
+                handle = await element.evaluate_handle('(element) => element.closest("a")')
+                # evaluate_handle always returns a handle, even for null;
+                # as_element() yields None when it is not a DOM element.
+                anchor = handle.as_element()
+                if anchor is not None:
+                    maker_link = await anchor.get_attribute('href')
+            else:
+                maker_link = await element.get_attribute('href')
+
+            if not maker_link:
+                continue
+
+            # Turn site-relative links into absolute ones.
+            if not maker_link.startswith('http'):
+                if maker_link.startswith('/'):
+                    maker_link = base_url + maker_link
+                else:
+                    maker_link = base_url + '/' + maker_link
+            return maker_link
+
+        return None
+
+    async def _extract_maker_statement(self, browser, maker_link):
+        """Open *maker_link* in a new page and return the first comment-like text, or None."""
+        statement_selectors = [
+            # NOTE(review): this id looks specific to a single comment and
+            # will rarely match; the generic selectors below are the
+            # realistic path.
+            "xpath=//*[@id='comment-4597755']/div/div[2]/div/div/div",
+            "xpath=//div[contains(@class, 'comment')]",
+            "xpath=//p[contains(@class, 'comment')]",
+            "xpath=//article"
+        ]
+
+        new_page = await browser.new_page()
+        try:
+            await new_page.goto(maker_link, wait_until="domcontentloaded")
+            await new_page.wait_for_timeout(5000)
+            return await self._first_text(
+                new_page,
+                statement_selectors,
+                lambda text: bool(text) and len(text) > 10,
+            )
+        finally:
+            # Close the extra page even when navigation or extraction fails.
+            await new_page.close()
+
+    async def _extract_fields(self, browser, page, product_info):
+        """Fill *product_info* in place with whatever fields can be extracted.
+
+        Every field is extracted independently and best effort: a failure is
+        logged as a warning and the remaining fields are still attempted.
+        """
+        # Product name: skip generic titles containing "Product Hunt" and
+        # very short strings.
+        try:
+            name = await self._first_text(
+                page,
+                [
+                    "xpath=//h1",
+                    "xpath=//h1[@data-test='product-name']",
+                    "xpath=//h1[contains(@class, 'text')]",
+                    "xpath=//title"
+                ],
+                lambda text: bool(text) and 'Product Hunt' not in text and len(text) > 5,
+            )
+            if name:
+                product_info['name'] = name
+                logger.info(f"提取到产品名称: {product_info['name']}")
+            else:
+                logger.warning("未找到有效的产品名称元素")
+        except Exception as e:
+            logger.warning(f"提取产品名称失败: {e}")
+
+        # Product introduction.
+        try:
+            introduction = await self._first_text(
+                page,
+                [
+                    "xpath=//*[@class='relative text-16 font-normal text-gray-700']//div",
+                    "xpath=//p[contains(@class, 'description')]",
+                    "xpath=//div[contains(@class, 'description')]",
+                    "xpath=//meta[@name='description']"
+                ],
+            )
+            if introduction:
+                product_info['introduction'] = introduction
+                logger.info(f"提取到产品简介: {product_info['introduction'][:100]}...")
+            else:
+                logger.warning("未找到产品简介元素")
+        except Exception as e:
+            logger.warning(f"提取产品简介失败: {e}")
+
+        # User count, stored as the raw text shown on the page.
+        try:
+            user_count = await self._first_text(
+                page,
+                [
+                    "xpath=//*[@class='flex flex-row gap-2']//div/div[2]/span/p",
+                    "xpath=//span[contains(text(), 'users')]",
+                    "xpath=//span[contains(text(), 'upvotes')]",
+                    "xpath=//div[contains(@class, 'stats')]"
+                ],
+            )
+            if user_count:
+                product_info['user_count'] = user_count
+                logger.info(f"提取到用户数: {product_info['user_count']}")
+            else:
+                logger.warning("未找到用户数元素")
+        except Exception as e:
+            logger.warning(f"提取用户数失败: {e}")
+
+        # Maker link.
+        try:
+            maker_link = await self._extract_maker_link(page)
+            if maker_link:
+                product_info['maker_link'] = maker_link
+                logger.info(f"提取到制作人链接: {maker_link}")
+            else:
+                logger.warning("未找到制作人链接元素")
+        except Exception as e:
+            logger.warning(f"提取制作人链接失败: {e}")
+
+        # Maker statement; requires the maker link found above.
+        try:
+            if product_info.get('maker_link'):
+                statement = await self._extract_maker_statement(browser, product_info['maker_link'])
+                if statement:
+                    product_info['maker_statement'] = statement
+                    logger.info(f"提取到制作人发言: {product_info['maker_statement'][:100]}...")
+            else:
+                logger.warning("没有制作人链接，跳过提取制作人发言")
+        except Exception as e:
+            logger.warning(f"提取制作人发言失败: {e}")
+
+    async def scrape_product_info(self, url):
+        """Scrape one ProductHunt page with a headless Chromium browser.
+
+        Args:
+            url: product page URL.
+
+        Returns:
+            dict: always contains ``'url'``; additionally any of ``name``,
+            ``introduction``, ``user_count``, ``maker_link`` and
+            ``maker_statement`` that could be extracted. When the scrape
+            fails as a whole (e.g. Playwright missing, navigation error) the
+            error is logged and only ``{'url': url}`` is returned.
+        """
+        try:
+            # Imported lazily so the module can be loaded without Playwright.
+            from playwright.async_api import async_playwright
+
+            logger.info(f"开始抓取: {url}")
+
+            product_info = {'url': url}
+
+            # The context manager stops the Playwright driver on every exit path.
+            async with async_playwright() as playwright:
+                browser = await playwright.chromium.launch(headless=True)
+                try:
+                    page = await browser.new_page()
+
+                    # Generous default timeout to survive Cloudflare challenges.
+                    page.set_default_timeout(120000)
+
+                    await page.goto(url, wait_until="domcontentloaded")
+                    await self._wait_for_cloudflare(page)
+
+                    # Give client-side rendering a moment to finish.
+                    await page.wait_for_timeout(3000)
+
+                    await self._extract_fields(browser, page, product_info)
+                finally:
+                    # Close the browser even when navigation or extraction
+                    # raised, so no Chromium process is left behind.
+                    await browser.close()
+
+            logger.success(f"抓取完成: {product_info.get('name', '未知')}")
+            return product_info
+
+        except Exception as e:
+            logger.error(f"抓取产品信息失败: {e}")
+            return {'url': url}
+
+    async def process_urls(self):
+        """Query, scrape and store every ProductHunt URL not yet in product.db.
+
+        URLs that already have a row are skipped. A scrape that yields no
+        field besides the URL is not saved, so the URL is attempted again on
+        the next run instead of being treated as already processed.
+        """
+        self.product_urls = self.query_producthunt_urls()
+
+        if not self.product_urls:
+            logger.warning("未找到包含producthunt.com的链接")
+            return
+
+        self.init_product_database()
+
+        logger.info(f"开始处理 {len(self.product_urls)} 个产品链接")
+
+        with tqdm(total=len(self.product_urls), desc="处理进度") as pbar:
+            for url in self.product_urls:
+                try:
+                    if self.check_duplicate(url):
+                        logger.info(f"跳过已存在的链接: {url}")
+                        continue
+
+                    product_info = await self.scrape_product_info(url)
+
+                    # scrape_product_info() returns a non-empty dict even on
+                    # failure, so check for real data rather than truthiness.
+                    if product_info and any(product_info.get(field) for field in self._DATA_FIELDS):
+                        self.save_product_info(product_info)
+                    else:
+                        logger.warning(f"未抓取到有效数据，跳过保存: {url}")
+
+                except Exception as e:
+                    logger.error(f"处理链接失败 {url}: {e}")
+                finally:
+                    # Advance exactly once per URL on every path
+                    # (skipped, saved, or failed).
+                    pbar.update(1)
+
+    def run(self):
+        """Run the whole scraping task synchronously; errors are logged, not raised."""
+        logger.info("开始ProductHunt数据抓取任务")
+
+        try:
+            asyncio.run(self.process_urls())
+            logger.success("任务完成")
+
+        except Exception as e:
+            logger.error(f"程序执行失败: {e}")
+
+
+def main():
+    """Create a ProductHuntScraper and run the full scraping task."""
+    ProductHuntScraper().run()
+
+
+# Script entry point: only run the scraper when this file is executed directly.
+if __name__ == "__main__":
+    main()
--- a/product/products.db
+++ b/product/products.db