增加对producthunt网站的数据爬取

2025-11-17 07:39:45 +08:00
parent 256850f752
commit d07017cf11
27 changed files with 26638 additions and 2153 deletions
--- a/product/new_data_simple.py
+++ b/product/new_data_simple.py
@@ -0,0 +1,172 @@
+import os
+import json
+import time
+from datetime import datetime
+from selenium import webdriver
+from selenium.webdriver.common.by import By
+from selenium.webdriver.support.ui import WebDriverWait
+from selenium.webdriver.support import expected_conditions as EC
+from selenium.webdriver.chrome.options import Options
+from selenium.common.exceptions import TimeoutException, NoSuchElementException
+from loguru import logger
+
+class ProductHuntScraper:
+    """Scrape basic information from a single ProductHunt product page.
+
+    The scraper starts a Chrome session through Selenium, reads the product
+    name, description and first comment from the loaded page, and writes the
+    collected fields to a JSON file.
+    """
+
+    # Product page scraped when no URL is passed to the constructor.
+    DEFAULT_PRODUCT_URL = "https://www.producthunt.com/products/elsie-ai-beta"
+    # Chrome binary tried as a fallback when the default driver start-up fails.
+    SYSTEM_CHROME_PATH = "C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe"
+    # Placeholder stored for any field whose element is missing on the page.
+    NOT_FOUND = "未找到"
+
+    def __init__(self, product_url=None):
+        """Create a scraper.
+
+        Args:
+            product_url: Page to scrape. When omitted (or empty),
+                ``DEFAULT_PRODUCT_URL`` is used, so zero-argument callers
+                behave as before.
+        """
+        # Selenium WebDriver; set by connect_to_chrome(), cleared by close().
+        self.driver = None
+        # URL loaded by navigate_to_product(); may be reassigned before scraping.
+        self.product_url = product_url or self.DEFAULT_PRODUCT_URL
+
+    def connect_to_chrome(self):
+        """Start a Chrome WebDriver session and store it on ``self.driver``.
+
+        The default ``webdriver.Chrome`` start-up is tried first; if it
+        raises, a second attempt is made with ``binary_location`` set to
+        ``SYSTEM_CHROME_PATH``.
+
+        Returns:
+            True if a driver was created, False otherwise. Exceptions are
+            logged and reported as False.
+        """
+        try:
+            logger.info("正在初始化Chrome驱动...")
+
+            # Configure Chrome options.
+            chrome_options = Options()
+            for argument in (
+                "--no-sandbox",
+                "--disable-dev-shm-usage",
+                "--disable-gpu",
+                "--window-size=1920,1080",
+                "--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
+            ):
+                chrome_options.add_argument(argument)
+
+            # First attempt: let Selenium locate Chrome on its own.
+            try:
+                self.driver = webdriver.Chrome(options=chrome_options)
+                logger.info("成功连接到Chrome实例")
+                return True
+            except Exception as e:
+                logger.error(f"使用ChromeDriver连接失败: {e}")
+
+            # Second attempt: point Selenium at an explicit Chrome binary.
+            try:
+                chrome_options.binary_location = self.SYSTEM_CHROME_PATH
+                self.driver = webdriver.Chrome(options=chrome_options)
+                logger.info("成功连接到系统Chrome实例")
+                return True
+            except Exception as e:
+                logger.error(f"使用系统Chrome连接失败: {e}")
+                return False
+
+        except Exception as e:
+            logger.error(f"连接Chrome实例失败: {e}")
+            return False
+
+    def navigate_to_product(self, timeout=10):
+        """Load ``self.product_url`` and wait for the ``<body>`` element.
+
+        Args:
+            timeout: Maximum number of seconds to wait for ``<body>`` to be
+                present. Defaults to the previously hard-coded 10.
+
+        Returns:
+            True once the body element is present; False on timeout or on
+            any other exception (both are logged).
+        """
+        try:
+            logger.info(f"正在导航到产品页面: {self.product_url}")
+            self.driver.get(self.product_url)
+            # Wait for the page to load.
+            WebDriverWait(self.driver, timeout).until(
+                EC.presence_of_element_located((By.TAG_NAME, "body"))
+            )
+            logger.info("页面加载完成")
+            return True
+        except TimeoutException:
+            logger.error("页面加载超时")
+            return False
+        except Exception as e:
+            logger.error(f"导航到产品页面失败: {e}")
+            return False
+
+    def _read_text(self, by, selector, label, hint, preview=None):
+        """Return the stripped text of the first element matching a locator.
+
+        Args:
+            by: Selenium locator strategy (a ``By`` constant).
+            selector: Locator value passed to ``find_element``.
+            label: Field name used in log messages.
+            hint: Locator description appended to the "not found" warning.
+            preview: When given, only this many leading characters of the
+                text are logged (followed by ``...``); the full text is
+                still returned.
+
+        Returns:
+            The element text, or ``NOT_FOUND`` if no element matches.
+        """
+        try:
+            text = self.driver.find_element(by, selector).text.strip()
+        except NoSuchElementException:
+            logger.warning(f"未找到{label} ({hint})")
+            return self.NOT_FOUND
+        shown = text if preview is None else f"{text[:preview]}..."
+        logger.info(f"{label}: {shown}")
+        return text
+
+    def extract_product_info(self, wait_seconds=5):
+        """Collect name, description and first comment from the loaded page.
+
+        Args:
+            wait_seconds: Fixed pause before reading the page. Defaults to
+                the previously hard-coded 5 seconds.
+
+        Returns:
+            A dict with keys ``url``, ``scraped_at``, ``name``,
+            ``description`` and ``first_comment``; a missing element yields
+            ``NOT_FOUND`` for its field. Returns None if any unexpected
+            exception occurs (it is logged).
+        """
+        try:
+            logger.info("开始提取产品信息")
+
+            # Give the page time to finish loading before reading elements.
+            time.sleep(wait_seconds)
+
+            product_info = {
+                "url": self.product_url,
+                "scraped_at": datetime.now().isoformat(),
+            }
+
+            # Product name: first <h1> on the page.
+            product_info["name"] = self._read_text(
+                By.TAG_NAME, "h1", "产品名称", "h1标签"
+            )
+
+            # Product description: div with classes
+            # "relative text-16 font-normal text-gray-700".
+            desc_selector = "div.relative.text-16.font-normal.text-gray-700"
+            product_info["description"] = self._read_text(
+                By.CSS_SELECTOR, desc_selector, "产品简介", desc_selector, preview=50
+            )
+
+            # First comment: first div with classes "flex flex-1 flex-col gap-2".
+            # NOTE(review): these class-based selectors presumably track the
+            # site's current markup -- confirm they still match.
+            comment_selector = "div.flex.flex-1.flex-col.gap-2"
+            product_info["first_comment"] = self._read_text(
+                By.CSS_SELECTOR, comment_selector, "第一个评论", comment_selector, preview=50
+            )
+
+            return product_info
+
+        except Exception as e:
+            logger.error(f"提取产品信息失败: {e}")
+            return None
+
+    def save_to_file(self, data, filename="product_info.json"):
+        """Write *data* to *filename* as indented UTF-8 JSON.
+
+        Args:
+            data: JSON-serialisable object to store.
+            filename: Destination path; an existing file is overwritten.
+
+        Returns:
+            True on success, False if writing or serialising failed (the
+            error is logged).
+        """
+        try:
+            with open(filename, "w", encoding="utf-8") as f:
+                json.dump(data, f, ensure_ascii=False, indent=2)
+            logger.info(f"数据已保存到 {filename}")
+            return True
+        except Exception as e:
+            logger.error(f"保存数据失败: {e}")
+            return False
+
+    def close(self):
+        """Quit the browser, if one is open, and forget the driver.
+
+        The driver reference is cleared even when ``quit()`` raises, so a
+        repeated ``close()`` never calls ``quit()`` on a dead session.
+        """
+        if not self.driver:
+            return
+        try:
+            self.driver.quit()
+            logger.info("浏览器已关闭")
+        finally:
+            self.driver = None
+
+    def scrape_product(self):
+        """Run the full flow: connect, navigate, extract, save, close.
+
+        Returns:
+            True only if every step succeeded, including writing the output
+            file; False otherwise. The browser is closed in all cases once
+            a connection was made.
+        """
+        if not self.connect_to_chrome():
+            logger.error("无法连接到Chrome实例")
+            return False
+
+        try:
+            if not self.navigate_to_product():
+                logger.error("无法导航到产品页面")
+                return False
+
+            product_info = self.extract_product_info()
+            if not product_info:
+                logger.error("未能提取产品信息")
+                return False
+
+            # Propagate the save result: a failed write is a failed scrape
+            # (save_to_file already logs the reason).
+            return self.save_to_file(product_info)
+        finally:
+            self.close()
+
+def main():
+    """Scrape the default ProductHunt product once and log the outcome."""
+    logger.info("开始ProductHunt产品信息抓取")
+    scraper = ProductHuntScraper()
+
+    # To scrape a different product, reassign product_url before scraping:
+    # scraper.product_url = "https://www.producthunt.com/products/your-product"
+
+    if scraper.scrape_product():
+        logger.info("产品信息抓取完成")
+        return
+
+    logger.error("产品信息抓取失败")
+
+# Run the scraper only when this file is executed as a script.
+if __name__ == "__main__":
+    main()