"""Scrape basic information about a single ProductHunt product page.

The script downloads the product page, stores the raw HTML for debugging,
extracts a few fields with CSS selectors and writes them to a JSON file.
"""

import json
import os
import time
from datetime import datetime

import requests
from bs4 import BeautifulSoup
from loguru import logger


class ProductHuntScraper:
    """Fetch one ProductHunt product page and extract a few fields from it.

    Attributes:
        session: ``requests.Session`` reused for every request so cookies set
            by the home page are sent with the product-page request.
        product_url: URL of the product page to scrape.
        timeout: Timeout in seconds handed to every HTTP request.
        headers: Browser-like request headers sent with every request.
    """

    DEFAULT_PRODUCT_URL = "https://www.producthunt.com/products/elsie-ai-beta"
    HOME_URL = "https://www.producthunt.com/"

    # Placeholder stored in the result when a field cannot be located.
    NOT_FOUND = "未找到"

    # Candidate CSS selectors for the product description, tried in order;
    # the first one that yields non-empty text wins.
    DESCRIPTION_SELECTORS = (
        "div.relative.text-16.font-normal.text-gray-700",
        ".text-16.font-normal.text-gray-700",
        "[class*='text-16'][class*='font-normal'][class*='text-gray-700']",
        "div[class*='description']",
        ".product-description",
        "div[class*='tagline']",
        "div[class*='subtitle']",
        "p[class*='text-gray']",
        "div[class*='mb-4']",
    )

    # Candidate CSS selectors for the first comment, tried in order.
    COMMENT_SELECTORS = (
        "div.flex.flex-1.flex-col.gap-2",
        ".flex.flex-1.flex-col.gap-2",
        "[class*='flex'][class*='flex-1'][class*='flex-col'][class*='gap-2']",
        "div[class*='comment']",
        ".comment-text",
        "div[class*='review']",
        "div[class*='feedback']",
        "blockquote",
        "div[class*='border']",
    )

    # Selector matching anything that looks like a tag/category/topic element.
    TAG_SELECTOR = "[class*='tag'], [class*='category'], [class*='topic']"
    MAX_TAGS = 5

    def __init__(self, product_url=None, timeout=30):
        """Create a scraper.

        Args:
            product_url: Product page to scrape. Defaults to
                ``DEFAULT_PRODUCT_URL`` when omitted or empty.
            timeout: Seconds to wait for each HTTP request before giving up.
                Without a timeout ``requests`` may block indefinitely.
        """
        self.session = requests.Session()
        self.product_url = product_url or self.DEFAULT_PRODUCT_URL
        self.timeout = timeout
        # Fuller set of request headers that imitates a real browser.
        # NOTE(review): 'br' is advertised in Accept-Encoding; this presumably
        # relies on a brotli decoder being installed for the response body to
        # be decoded transparently -- confirm in the target environment.
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
            'Accept-Language': 'en-US,en;q=0.9',
            'Accept-Encoding': 'gzip, deflate, br',
            'DNT': '1',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
            'Sec-Fetch-Dest': 'document',
            'Sec-Fetch-Mode': 'navigate',
            'Sec-Fetch-Site': 'none',
            'Sec-Fetch-User': '?1',
            'Cache-Control': 'max-age=0',
            'sec-ch-ua': '"Not_A Brand";v="8", "Chromium";v="120", "Google Chrome";v="120"',
            'sec-ch-ua-mobile': '?0',
            'sec-ch-ua-platform': '"Windows"',
        }

    def get_page_content(self):
        """Download the product page.

        The home page is requested first and the method then sleeps for two
        seconds before requesting the product page, to look less like a bot.

        Returns:
            The page HTML as ``str`` when the product page answers with
            status 200, otherwise ``None``. Any exception raised while
            fetching is logged and also results in ``None``.
        """
        try:
            logger.info(f"正在获取页面内容: {self.product_url}")

            # Visit the home page first.
            logger.info("首先访问ProductHunt主页")
            main_page = self.session.get(
                self.HOME_URL, headers=self.headers, timeout=self.timeout
            )
            logger.info(f"主页状态码: {main_page.status_code}")

            # Pause briefly to imitate human behaviour.
            time.sleep(2)

            # Then request the product page itself.
            response = self.session.get(
                self.product_url, headers=self.headers, timeout=self.timeout
            )

            if response.status_code == 200:
                logger.info("成功获取页面内容")
                return response.text

            logger.error(f"获取页面失败,状态码: {response.status_code}")
            logger.info(f"响应头: {response.headers}")
            return None
        except Exception as e:
            # Boundary handler: callers rely on None instead of an exception.
            logger.error(f"获取页面内容失败: {str(e)}")
            return None

    def _first_text_match(self, soup, selectors, label):
        """Return the text of the first selector that yields non-empty text.

        Args:
            soup: Parsed ``BeautifulSoup`` document.
            selectors: CSS selectors to try, in priority order.
            label: Field name embedded in the log messages.

        Returns:
            The stripped text of the first matching element with non-empty
            text, or ``None`` when no selector produces any text.
        """
        for selector in selectors:
            try:
                element = soup.select_one(selector)
                text = element.get_text(strip=True) if element else ""
            except Exception as e:
                # A failing selector must not prevent trying the next one.
                logger.debug(f"使用选择器 {selector} 提取{label}时出错: {str(e)}")
                continue
            if text:
                logger.info(f"使用选择器 {selector} 找到{label}: {text[:50]}...")
                return text
        return None

    def extract_product_info(self, html_content):
        """Extract product information from the page HTML.

        Args:
            html_content: HTML text of the product page.

        Returns:
            A dict with the keys ``url``, ``scraped_at``, ``name``,
            ``description`` and ``first_comment`` (missing fields hold the
            ``NOT_FOUND`` placeholder) plus ``tags`` when tag-like elements
            exist, or ``None`` when parsing fails.
        """
        try:
            logger.info("开始解析HTML内容")
            soup = BeautifulSoup(html_content, 'html.parser')

            product_info = {
                "url": self.product_url,
                "scraped_at": datetime.now().isoformat(),
            }

            # Product name: text of the first <h1> element.
            try:
                name_element = soup.find('h1')
                if name_element:
                    product_info["name"] = name_element.get_text(strip=True)
                    logger.info(f"产品名称: {product_info['name']}")
                else:
                    logger.warning("未找到产品名称 (h1标签)")
                    product_info["name"] = self.NOT_FOUND
            except Exception as e:
                logger.warning(f"提取产品名称时出错: {str(e)}")
                product_info["name"] = self.NOT_FOUND

            # Description and first comment share the same search strategy:
            # walk a list of candidate selectors and keep the first hit.
            fields = (
                ("description", self.DESCRIPTION_SELECTORS, "产品简介"),
                ("first_comment", self.COMMENT_SELECTORS, "第一个评论"),
            )
            for key, selectors, label in fields:
                text = self._first_text_match(soup, selectors, label)
                if text is None:
                    logger.warning(f"未找到{label}")
                    text = self.NOT_FOUND
                product_info[key] = text

            # Best-effort extraction of additional information: product tags.
            try:
                tag_elements = soup.select(self.TAG_SELECTOR)
                if tag_elements:
                    texts = (tag.get_text(strip=True) for tag in tag_elements)
                    # The selector also matches nested elements, so identical
                    # text shows up repeatedly; de-duplicate while keeping
                    # document order so the cap holds distinct tags.
                    tags = list(dict.fromkeys(t for t in texts if t))
                    product_info["tags"] = tags[:self.MAX_TAGS]
                    logger.info(f"找到标签: {tags[:3]}")
            except Exception as e:
                logger.debug(f"提取标签时出错: {str(e)}")

            return product_info
        except Exception as e:
            # Boundary handler: callers rely on None instead of an exception.
            logger.error(f"解析HTML内容失败: {str(e)}")
            return None

    def save_to_file(self, data, filename="product_info.json"):
        """Write ``data`` to ``filename`` as indented UTF-8 JSON.

        Returns:
            ``True`` on success, ``False`` when writing fails (the error is
            logged, not raised).
        """
        try:
            with open(filename, "w", encoding="utf-8") as f:
                json.dump(data, f, ensure_ascii=False, indent=2)
            logger.info(f"数据已保存到 {filename}")
            return True
        except Exception as e:
            logger.error(f"保存数据失败: {str(e)}")
            return False

    def save_html(self, html_content, filename="product_page.html"):
        """Write the raw HTML to ``filename`` for debugging purposes.

        Returns:
            ``True`` on success, ``False`` when writing fails (the error is
            logged, not raised).
        """
        try:
            with open(filename, "w", encoding="utf-8") as f:
                f.write(html_content)
            logger.info(f"HTML内容已保存到 {filename}")
            return True
        except Exception as e:
            logger.error(f"保存HTML内容失败: {str(e)}")
            return False

    def scrape_product(self):
        """Run the complete workflow: fetch, dump HTML, extract, save JSON.

        Returns:
            ``True`` only when the page was fetched, the information was
            extracted and the JSON file was written; ``False`` otherwise.
        """
        html_content = self.get_page_content()
        if not html_content:
            logger.error("无法获取页面内容")
            return False

        # Keep a copy of the HTML for debugging; a failure here is logged by
        # save_html and deliberately does not abort the run.
        self.save_html(html_content)

        product_info = self.extract_product_info(html_content)
        if not product_info:
            logger.error("未能提取产品信息")
            return False

        # Report the real outcome of the save instead of assuming success.
        return self.save_to_file(product_info)


def main():
    """Scrape the default product and log whether the run succeeded."""
    logger.info("开始ProductHunt产品信息抓取")

    # To scrape a different product, pass its URL to the constructor, e.g.
    # ProductHuntScraper(product_url="https://www.producthunt.com/products/your-product"),
    # or assign scraper.product_url before calling scrape_product().
    scraper = ProductHuntScraper()

    success = scraper.scrape_product()

    if success:
        logger.info("产品信息抓取完成")
    else:
        logger.error("产品信息抓取失败")


if __name__ == "__main__":
    main()