更新了抓取 ProductHunt 的代码：integrated_scraper.py 中 limit 默认值由 10 改为 0，并去掉了查询 producthunt.com 链接时的 LIMIT 限制

2025-11-23 22:14:53 +08:00
parent 9088939701
commit 4a48b9a9cb
9 changed files with 260 additions and 480 deletions
--- a/product/integrated_scraper.py
+++ b/product/integrated_scraper.py
@@ -31,7 +31,7 @@ logger.add(sys.stderr, level="INFO", format="<green>{time:YYYY-MM-DD HH:mm:ss}</
 class ProductHuntScraperFull:
    """全功能ProductHunt数据抓取器"""
    
-    def __init__(self, tophub_db_path=None, product_db_path=None, debug_port=9222, limit=10, skip_duplicates=True):
+    def __init__(self, tophub_db_path=None, product_db_path=None, debug_port=9222, limit=0, skip_duplicates=True):
        """
        初始化抓取器
        
@@ -68,12 +68,9 @@ class ProductHuntScraperFull:
            conn = sqlite3.connect(self.tophub_db_path)
            cursor = conn.cursor()
            
-            # 查询包含producthunt.com的链接
-            if limit > 0:
-                cursor.execute("SELECT url FROM articles WHERE url LIKE '%producthunt.com%' LIMIT ?", (limit,))
-            else:
-                cursor.execute("SELECT url FROM articles WHERE url LIKE '%producthunt.com%'")
-                
+            # 查询包含producthunt.com的链接（去掉LIMIT限制）
+            cursor.execute("SELECT url FROM articles WHERE url LIKE '%producthunt.com%'")
+            
            urls = [row[0] for row in cursor.fetchall()]
            
            conn.close()
@@ -322,7 +319,7 @@ def parse_arguments():
    parser.add_argument("--tophub-db", help="tophub数据库路径", default=None)
    parser.add_argument("--product-db", help="产品数据库路径", default=None)
    parser.add_argument("--debug-port", type=int, help="Chrome调试端口", default=9222)
-    parser.add_argument("--limit", type=int, help="抓取链接数量限制", default=10)
+    parser.add_argument("--limit", type=int, help="抓取链接数量限制", default=0)
    parser.add_argument("--no-skip-duplicates", action="store_true", help="不跳过重复URL")
    parser.add_argument("--urls", nargs="+", help="指定要抓取的URL列表")
    parser.add_argument("--log-file", help="日志文件路径", default="producthunt_scraper.log")