增加对producthunt网站的数据爬取

2025-11-17 07:39:45 +08:00
parent 256850f752
commit d07017cf11
27 changed files with 26638 additions and 2153 deletions
--- a/product/new_data_requests.py
+++ b/product/new_data_requests.py
@@ -0,0 +1,176 @@
+import os
+import json
+import time
+from datetime import datetime
+import requests
+from bs4 import BeautifulSoup
+from loguru import logger
+
+class ProductHuntScraper:
+    """Scrape one Product Hunt product page into a small JSON record.
+
+    Assign ``product_url`` before calling ``scrape_product`` to target another
+    page. Methods log failures and signal them by returning None or False.
+    """
+
+    # Seconds allowed per HTTP request, so a stalled connection cannot hang the run.
+    REQUEST_TIMEOUT = 30
+
+    def __init__(self):
+        self.session = requests.Session()
+        self.product_url = "https://www.producthunt.com/products/elsie-ai-beta"
+
+        # Request headers that imitate a browser visit.
+        self.headers = {
+            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
+            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
+            'Accept-Language': 'en-US,en;q=0.5',
+            'Referer': 'https://www.producthunt.com/',
+            'DNT': '1',
+            'Connection': 'keep-alive',
+            'Upgrade-Insecure-Requests': '1',
+        }
+
+    def get_page_content(self):
+        """Fetch ``product_url``; return the HTML text, or None (logged) on failure."""
+        try:
+            logger.info(f"正在获取页面内容: {self.product_url}")
+            response = self.session.get(self.product_url, headers=self.headers, timeout=self.REQUEST_TIMEOUT)
+            if response.status_code == 200:
+                logger.info("成功获取页面内容")
+                return response.text
+            logger.error(f"获取页面失败，状态码: {response.status_code}")
+            return None
+        except Exception as e:
+            logger.error(f"获取页面内容失败: {str(e)}")
+            return None
+
+    @staticmethod
+    def _first_match_text(soup, selectors, label):
+        """Return the stripped text of the first element any selector matches.
+
+        Selectors are tried in order; ``label`` names the field in log messages.
+        When nothing matches, a warning is logged and "未找到" is returned.
+        """
+        for selector in selectors:
+            try:
+                element = soup.select_one(selector)
+                if element:
+                    text = element.get_text(strip=True)
+                    logger.info(f"使用选择器 {selector} 找到{label}: {text[:50]}...")
+                    return text
+            except Exception as e:
+                logger.debug(f"使用选择器 {selector} 提取{label}时出错: {str(e)}")
+        logger.warning(f"未找到{label}")
+        return "未找到"
+
+    def extract_product_info(self, html_content):
+        """Parse page HTML into a dict of product fields, or None if parsing raises.
+
+        Keys are ``url``, ``scraped_at``, ``name``, ``description`` and
+        ``first_comment``; a field that cannot be found is stored as "未找到".
+        """
+        try:
+            logger.info("开始解析HTML内容")
+            soup = BeautifulSoup(html_content, 'html.parser')
+            product_info = {
+                "url": self.product_url,
+                "scraped_at": datetime.now().isoformat()
+            }
+
+            # Product name: text of the first <h1> element.
+            try:
+                name_element = soup.find('h1')
+                if name_element:
+                    product_info["name"] = name_element.get_text(strip=True)
+                    logger.info(f"产品名称: {product_info['name']}")
+                else:
+                    logger.warning("未找到产品名称 (h1标签)")
+                    product_info["name"] = "未找到"
+            except Exception as e:
+                logger.warning(f"提取产品名称时出错: {str(e)}")
+                product_info["name"] = "未找到"
+
+            # Product description: candidate CSS selectors, tried in order.
+            desc_selectors = [
+                "div.relative.text-16.font-normal.text-gray-700",
+                ".text-16.font-normal.text-gray-700",
+                "[class*='text-16'][class*='font-normal'][class*='text-gray-700']",
+                "div[class*='description']",
+                ".product-description",
+                "div[class*='tagline']"
+            ]
+            product_info["description"] = self._first_match_text(soup, desc_selectors, "产品简介")
+
+            # First comment: candidate CSS selectors, tried in order.
+            comment_selectors = [
+                "div.flex.flex-1.flex-col.gap-2",
+                ".flex.flex-1.flex-col.gap-2",
+                "[class*='flex'][class*='flex-1'][class*='flex-col'][class*='gap-2']",
+                "div[class*='comment']",
+                ".comment-text",
+                "div[class*='review']"
+            ]
+            product_info["first_comment"] = self._first_match_text(soup, comment_selectors, "第一个评论")
+
+            return product_info
+        except Exception as e:
+            logger.error(f"解析HTML内容失败: {str(e)}")
+            return None
+
+    def save_to_file(self, data, filename="product_info.json"):
+        """Write ``data`` to ``filename`` as UTF-8 JSON; return True on success, else False."""
+        try:
+            with open(filename, "w", encoding="utf-8") as f:
+                json.dump(data, f, ensure_ascii=False, indent=2)
+            logger.info(f"数据已保存到 {filename}")
+            return True
+        except Exception as e:
+            logger.error(f"保存数据失败: {str(e)}")
+            return False
+
+    def save_html(self, html_content, filename="product_page.html"):
+        """Write the raw HTML to ``filename`` for debugging; return True on success, else False."""
+        try:
+            with open(filename, "w", encoding="utf-8") as f:
+                f.write(html_content)
+            logger.info(f"HTML内容已保存到 {filename}")
+            return True
+        except Exception as e:
+            logger.error(f"保存HTML内容失败: {str(e)}")
+            return False
+
+    def scrape_product(self):
+        """Fetch, parse and save the product page.
+
+        Returns True only when the page was fetched, parsed and written to the JSON file.
+        """
+        html_content = self.get_page_content()
+        if not html_content:
+            logger.error("无法获取页面内容")
+            return False
+
+        self.save_html(html_content)  # best-effort raw copy for debugging
+        product_info = self.extract_product_info(html_content)
+        if not product_info:
+            logger.error("未能提取产品信息")
+            return False
+        # Propagate the save result so a failed write is not reported as success.
+        return self.save_to_file(product_info)
+
+def main():
+    """Scrape the default Product Hunt product page and log the outcome."""
+    logger.info("开始ProductHunt产品信息抓取")
+    hunter = ProductHuntScraper()
+
+    # To scrape a different product, assign hunter.product_url before the
+    # call below, e.g. "https://www.producthunt.com/products/your-product".
+    finished = hunter.scrape_product()
+
+    if not finished:
+        logger.error("产品信息抓取失败")
+        return
+    logger.info("产品信息抓取完成")
+
+if __name__ == "__main__":  # run the scraper only when executed as a script
+    main()