增加对producthunt网站的数据爬取

This commit is contained in:
2025-11-17 07:39:45 +08:00
parent 256850f752
commit d07017cf11
27 changed files with 26638 additions and 2153 deletions

View File

@@ -0,0 +1,176 @@
import os
import json
import time
from datetime import datetime
import requests
from bs4 import BeautifulSoup
from loguru import logger
class ProductHuntScraper:
    """Scrape basic details from a single Product Hunt product page.

    The scraper downloads one page with browser-like request headers,
    extracts the product name (first ``<h1>``), a description and the first
    comment from the HTML, and writes both the raw HTML and the extracted
    fields to disk.

    Attributes:
        session: ``requests.Session`` reused for every download.
        product_url: URL fetched by ``get_page_content``.
        timeout: Seconds passed to ``requests`` as the request timeout.
        headers: Headers sent with every download.
    """

    # Page scraped when the caller does not supply a URL.
    DEFAULT_PRODUCT_URL = "https://www.producthunt.com/products/elsie-ai-beta"
    # Seconds to wait on the server; without a timeout requests can block forever.
    DEFAULT_TIMEOUT = 15
    # Placeholder stored for any field that could not be extracted.
    NOT_FOUND = "未找到"

    # CSS selectors tried in order for the product description.
    DESCRIPTION_SELECTORS = (
        "div.relative.text-16.font-normal.text-gray-700",
        ".text-16.font-normal.text-gray-700",
        "[class*='text-16'][class*='font-normal'][class*='text-gray-700']",
        "div[class*='description']",
        ".product-description",
        "div[class*='tagline']",
    )
    # CSS selectors tried in order for the first comment.
    COMMENT_SELECTORS = (
        "div.flex.flex-1.flex-col.gap-2",
        ".flex.flex-1.flex-col.gap-2",
        "[class*='flex'][class*='flex-1'][class*='flex-col'][class*='gap-2']",
        "div[class*='comment']",
        ".comment-text",
        "div[class*='review']",
    )

    def __init__(self, product_url=DEFAULT_PRODUCT_URL, timeout=DEFAULT_TIMEOUT):
        """Create a scraper for one product page.

        Args:
            product_url: Page to scrape; defaults to the original hard-coded
                product so ``ProductHuntScraper()`` keeps working.
            timeout: Request timeout in seconds.
        """
        self.session = requests.Session()
        self.product_url = product_url
        self.timeout = timeout
        # Browser-like headers so the request resembles a normal page visit.
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'Referer': 'https://www.producthunt.com/',
            'DNT': '1',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
        }

    def get_page_content(self):
        """Download ``self.product_url`` and return its HTML.

        Returns:
            The response body as text, or ``None`` when the request fails
            or the status code is not 200.
        """
        logger.info(f"正在获取页面内容: {self.product_url}")
        try:
            response = self.session.get(
                self.product_url,
                headers=self.headers,
                timeout=self.timeout,
            )
        except requests.RequestException as e:
            logger.error(f"获取页面内容失败: {str(e)}")
            return None

        if response.status_code != 200:
            logger.error(f"获取页面失败,状态码: {response.status_code}")
            return None

        logger.info("成功获取页面内容")
        return response.text

    def _extract_name(self, soup):
        """Return the text of the first ``<h1>``, or the not-found placeholder."""
        try:
            heading = soup.find('h1')
            if heading is not None:
                name = heading.get_text(strip=True)
                logger.info(f"产品名称: {name}")
                return name
            logger.warning("未找到产品名称 (h1标签)")
        except Exception as e:
            logger.warning(f"提取产品名称时出错: {str(e)}")
        return self.NOT_FOUND

    def _extract_by_selectors(self, soup, selectors, label):
        """Return the text of the first element matched by any selector.

        Selectors are tried in order; a selector that raises is logged at
        debug level and skipped. ``label`` is the field name used in log
        messages. Returns the not-found placeholder when nothing matches.
        """
        for selector in selectors:
            try:
                element = soup.select_one(selector)
                if element is not None:
                    text = element.get_text(strip=True)
                    logger.info(f"使用选择器 {selector} 找到{label}: {text[:50]}...")
                    return text
            except Exception as e:
                logger.debug(f"使用选择器 {selector} 提取{label}时出错: {str(e)}")
        logger.warning(f"未找到{label}")
        return self.NOT_FOUND

    def extract_product_info(self, html_content):
        """Extract product fields from a page's HTML.

        Args:
            html_content: HTML text of the product page.

        Returns:
            A dict with keys ``url``, ``scraped_at``, ``name``,
            ``description`` and ``first_comment`` (missing fields hold the
            not-found placeholder), or ``None`` if parsing fails outright.
        """
        try:
            logger.info("开始解析HTML内容")
            soup = BeautifulSoup(html_content, 'html.parser')
            product_info = {
                "url": self.product_url,
                "scraped_at": datetime.now().isoformat(),
            }
            product_info["name"] = self._extract_name(soup)
            product_info["description"] = self._extract_by_selectors(
                soup, self.DESCRIPTION_SELECTORS, "产品简介"
            )
            product_info["first_comment"] = self._extract_by_selectors(
                soup, self.COMMENT_SELECTORS, "第一个评论"
            )
            return product_info
        except Exception as e:
            # Parsing is best-effort: any failure is reported as "no data".
            logger.error(f"解析HTML内容失败: {str(e)}")
            return None

    def save_to_file(self, data, filename="product_info.json"):
        """Write ``data`` to ``filename`` as indented UTF-8 JSON.

        Returns:
            ``True`` on success, ``False`` if serialisation or writing fails.
        """
        try:
            # Serialise before opening so a failure cannot leave a truncated file.
            payload = json.dumps(data, ensure_ascii=False, indent=2)
            with open(filename, "w", encoding="utf-8") as f:
                f.write(payload)
        except (OSError, TypeError, ValueError) as e:
            logger.error(f"保存数据失败: {str(e)}")
            return False
        logger.info(f"数据已保存到 {filename}")
        return True

    def save_html(self, html_content, filename="product_page.html"):
        """Write the raw HTML to ``filename`` for debugging.

        Returns:
            ``True`` on success, ``False`` if the file cannot be written.
        """
        try:
            with open(filename, "w", encoding="utf-8") as f:
                f.write(html_content)
        except (OSError, TypeError, ValueError) as e:
            logger.error(f"保存HTML内容失败: {str(e)}")
            return False
        logger.info(f"HTML内容已保存到 {filename}")
        return True

    def scrape_product(self):
        """Run the full pipeline: download, save HTML, extract, save JSON.

        Returns:
            ``True`` only when the page was fetched, parsed and the extracted
            data was written; ``False`` otherwise.
        """
        html_content = self.get_page_content()
        if not html_content:
            logger.error("无法获取页面内容")
            return False

        # The HTML dump is a debugging aid; failing to write it is not fatal.
        self.save_html(html_content)

        product_info = self.extract_product_info(html_content)
        if not product_info:
            logger.error("未能提取产品信息")
            return False

        # Report failure if the result could not be persisted.
        return self.save_to_file(product_info)
def main():
    """Scrape the default Product Hunt page once and log the outcome."""
    logger.info("开始ProductHunt产品信息抓取")
    hunter = ProductHuntScraper()
    # To scrape a different product, point the scraper at another page first:
    # hunter.product_url = "https://www.producthunt.com/products/your-product"
    if not hunter.scrape_product():
        logger.error("产品信息抓取失败")
        return
    logger.info("产品信息抓取完成")


if __name__ == "__main__":
    main()