更新了抓取producthunt的代码

This commit is contained in:
2025-11-23 22:14:53 +08:00
parent 9088939701
commit 4a48b9a9cb
9 changed files with 260 additions and 480 deletions

View File

@@ -31,7 +31,7 @@ logger.add(sys.stderr, level="INFO", format="<green>{time:YYYY-MM-DD HH:mm:ss}</
class ProductHuntScraperFull:
"""全功能ProductHunt数据抓取器"""
def __init__(self, tophub_db_path=None, product_db_path=None, debug_port=9222, limit=10, skip_duplicates=True):
def __init__(self, tophub_db_path=None, product_db_path=None, debug_port=9222, limit=0, skip_duplicates=True):
"""
初始化抓取器
@@ -68,12 +68,9 @@ class ProductHuntScraperFull:
conn = sqlite3.connect(self.tophub_db_path)
cursor = conn.cursor()
# 查询包含producthunt.com的链接
if limit > 0:
cursor.execute("SELECT url FROM articles WHERE url LIKE '%producthunt.com%' LIMIT ?", (limit,))
else:
cursor.execute("SELECT url FROM articles WHERE url LIKE '%producthunt.com%'")
# 查询包含producthunt.com的链接(已去掉LIMIT限制)
cursor.execute("SELECT url FROM articles WHERE url LIKE '%producthunt.com%'")
urls = [row[0] for row in cursor.fetchall()]
conn.close()
@@ -322,7 +319,7 @@ def parse_arguments():
parser.add_argument("--tophub-db", help="tophub数据库路径", default=None)
parser.add_argument("--product-db", help="产品数据库路径", default=None)
parser.add_argument("--debug-port", type=int, help="Chrome调试端口", default=9222)
parser.add_argument("--limit", type=int, help="抓取链接数量限制", default=10)
parser.add_argument("--limit", type=int, help="抓取链接数量限制", default=0)
parser.add_argument("--no-skip-duplicates", action="store_true", help="不跳过重复URL")
parser.add_argument("--urls", nargs="+", help="指定要抓取的URL列表")
parser.add_argument("--log-file", help="日志文件路径", default="producthunt_scraper.log")