增加对producthunt网站的数据爬取

2025-11-17 07:39:45 +08:00
parent 256850f752
commit d07017cf11
27 changed files with 26638 additions and 2153 deletions
--- a/run_stealth_example.py
+++ b/run_stealth_example.py
@@ -0,0 +1,42 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+playwright_stealth 使用示例
+演示如何使用 stealth 模式运行 ProductHunt 爬虫
+"""
+
+import asyncio
+from loguru import logger
+from product.new_data_stealth import ProductHuntScraper
+
+async def run_stealth_scraper():
+    """运行 stealth 版本的爬虫"""
+    logger.info("开始运行 stealth 版本的 ProductHunt 爬虫")
+    
+    # 创建爬虫实例
+    scraper = ProductHuntScraper()
+    
+    # 执行爬取
+    success = await scraper.scrape()
+    
+    if success:
+        logger.success("Stealth 爬虫执行成功！")
+        logger.info("生成的文件：")
+        logger.info("- product_info_stealth.json: 产品信息数据")
+        logger.info("- product_page_stealth.html: 页面HTML内容")
+        logger.info("- product_screenshot_stealth.png: 页面截图")
+    else:
+        logger.error("Stealth 爬虫执行失败")
+    
+    return success
+
+def main():
+    """主函数"""
+    logger.info("=== playwright_stealth 使用示例 ===")
+    logger.info("此示例演示如何使用 playwright_stealth 模块增强浏览器反检测能力")
+    
+    # 运行异步任务
+    asyncio.run(run_stealth_scraper())
+
+if __name__ == "__main__":
+    main()