176 lines
6.9 KiB
Python
176 lines
6.9 KiB
Python
|
|
import os
|
|||
|
|
import json
|
|||
|
|
import time
|
|||
|
|
from datetime import datetime
|
|||
|
|
import requests
|
|||
|
|
from bs4 import BeautifulSoup
|
|||
|
|
from loguru import logger
|
|||
|
|
|
|||
|
|
class ProductHuntScraper:
    """Scrape basic information from a single Product Hunt product page.

    The scraper downloads the page at ``product_url``, extracts the product
    name, a description and the first comment using a list of fallback CSS
    selectors, and writes the result to a JSON file. Every public method is
    best-effort: failures are logged and reported through the return value
    (``None`` / ``False``) instead of being raised.

    Attributes:
        session: The ``requests.Session`` used for all HTTP requests.
        product_url: URL of the product page to scrape. May be reassigned
            after construction.
        timeout: Timeout in seconds passed to every HTTP request.
        headers: Browser-like request headers sent with every request.
    """

    DEFAULT_PRODUCT_URL = "https://www.producthunt.com/products/elsie-ai-beta"
    DEFAULT_TIMEOUT = 30

    # Placeholder stored in the result when a field cannot be located.
    NOT_FOUND = "未找到"

    # Candidate CSS selectors for the product description, tried in order.
    DESCRIPTION_SELECTORS = (
        "div.relative.text-16.font-normal.text-gray-700",
        ".text-16.font-normal.text-gray-700",
        "[class*='text-16'][class*='font-normal'][class*='text-gray-700']",
        "div[class*='description']",
        ".product-description",
        "div[class*='tagline']",
    )

    # Candidate CSS selectors for the first comment, tried in order.
    COMMENT_SELECTORS = (
        "div.flex.flex-1.flex-col.gap-2",
        ".flex.flex-1.flex-col.gap-2",
        "[class*='flex'][class*='flex-1'][class*='flex-col'][class*='gap-2']",
        "div[class*='comment']",
        ".comment-text",
        "div[class*='review']",
    )

    def __init__(self, product_url=None, timeout=DEFAULT_TIMEOUT):
        """Create a scraper.

        Args:
            product_url: Product page to scrape. Defaults to
                ``DEFAULT_PRODUCT_URL`` when omitted or ``None``.
            timeout: Seconds to wait for the server before giving up on a
                request.
        """
        self.session = requests.Session()
        self.product_url = (
            self.DEFAULT_PRODUCT_URL if product_url is None else product_url
        )
        self.timeout = timeout

        # Request headers that imitate a regular browser visit.
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'Referer': 'https://www.producthunt.com/',
            'DNT': '1',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
        }

    def get_page_content(self):
        """Download the product page.

        Returns:
            The response body as text when the server answers with HTTP 200,
            otherwise ``None`` (non-200 status, timeout, or any other error;
            the reason is logged).
        """
        try:
            logger.info(f"正在获取页面内容: {self.product_url}")
            # Without a timeout a stalled connection would block forever.
            response = self.session.get(
                self.product_url,
                headers=self.headers,
                timeout=self.timeout,
            )

            # Anything other than 200 is treated as a failure.
            if response.status_code != 200:
                logger.error(f"获取页面失败,状态码: {response.status_code}")
                return None

            logger.info("成功获取页面内容")
            return response.text

        except Exception as e:
            # Deliberate best-effort boundary: log and signal failure.
            logger.error(f"获取页面内容失败: {str(e)}")
            return None

    def _extract_name(self, soup):
        """Return the text of the first ``<h1>``, or ``NOT_FOUND``."""
        try:
            name_element = soup.find('h1')
            if name_element:
                name = name_element.get_text(strip=True)
                logger.info(f"产品名称: {name}")
                return name
            logger.warning("未找到产品名称 (h1标签)")
        except Exception as e:
            logger.warning(f"提取产品名称时出错: {str(e)}")
        return self.NOT_FOUND

    def _extract_first_match(self, soup, selectors, label):
        """Return the text of the first element matched by any selector.

        Args:
            soup: Parsed document to search.
            selectors: CSS selectors to try, in priority order.
            label: Field name used in the log messages.

        Returns:
            The stripped text of the first matching element (which may be an
            empty string), or ``NOT_FOUND`` when no selector matches.
        """
        for selector in selectors:
            try:
                element = soup.select_one(selector)
                if element:
                    text = element.get_text(strip=True)
                    logger.info(f"使用选择器 {selector} 找到{label}: {text[:50]}...")
                    return text
            except Exception as e:
                # A failing selector must not prevent trying the next one.
                logger.debug(f"使用选择器 {selector} 提取{label}时出错: {str(e)}")

        logger.warning(f"未找到{label}")
        return self.NOT_FOUND

    def extract_product_info(self, html_content):
        """Extract product information from the page HTML.

        Args:
            html_content: HTML text of the product page.

        Returns:
            A dict with the keys ``url``, ``scraped_at``, ``name``,
            ``description`` and ``first_comment``; fields that cannot be
            located hold the ``NOT_FOUND`` placeholder. Returns ``None`` when
            parsing fails.
        """
        try:
            logger.info("开始解析HTML内容")
            soup = BeautifulSoup(html_content, 'html.parser')

            product_info = {
                "url": self.product_url,
                "scraped_at": datetime.now().isoformat(),
            }

            # Product name comes from the first <h1> tag.
            product_info["name"] = self._extract_name(soup)

            # Description and first comment each try several selectors.
            product_info["description"] = self._extract_first_match(
                soup, self.DESCRIPTION_SELECTORS, "产品简介"
            )
            product_info["first_comment"] = self._extract_first_match(
                soup, self.COMMENT_SELECTORS, "第一个评论"
            )

            return product_info

        except Exception as e:
            logger.error(f"解析HTML内容失败: {str(e)}")
            return None

    def save_to_file(self, data, filename="product_info.json"):
        """Write ``data`` to ``filename`` as UTF-8 encoded, indented JSON.

        Returns:
            ``True`` on success, ``False`` if writing or serializing failed.
        """
        try:
            with open(filename, "w", encoding="utf-8") as f:
                json.dump(data, f, ensure_ascii=False, indent=2)
            logger.info(f"数据已保存到 {filename}")
            return True
        except Exception as e:
            logger.error(f"保存数据失败: {str(e)}")
            return False

    def save_html(self, html_content, filename="product_page.html"):
        """Write the raw HTML to ``filename`` (UTF-8) for debugging.

        Returns:
            ``True`` on success, ``False`` if the file could not be written.
        """
        try:
            with open(filename, "w", encoding="utf-8") as f:
                f.write(html_content)
            logger.info(f"HTML内容已保存到 {filename}")
            return True
        except Exception as e:
            logger.error(f"保存HTML内容失败: {str(e)}")
            return False

    def scrape_product(self):
        """Run the full flow: download, dump HTML, extract, save JSON.

        Returns:
            ``True`` only when the page was downloaded, parsed and the
            extracted data was written to disk; ``False`` otherwise.
        """
        html_content = self.get_page_content()
        if not html_content:
            logger.error("无法获取页面内容")
            return False

        # The HTML dump is a debugging aid only, so its result is ignored.
        self.save_html(html_content)

        product_info = self.extract_product_info(html_content)
        if not product_info:
            logger.error("未能提取产品信息")
            return False

        # Report a failed save instead of claiming success unconditionally.
        return self.save_to_file(product_info)
|
|||
|
|
|
|||
|
|
def main():
    """Scrape the default product page once and log the overall outcome."""
    logger.info("开始ProductHunt产品信息抓取")

    # To scrape a different product, assign another URL to ``product_url``
    # on the scraper before calling ``scrape_product()``.
    scraper = ProductHuntScraper()

    if not scraper.scrape_product():
        logger.error("产品信息抓取失败")
        return

    logger.info("产品信息抓取完成")
|
|||
|
|
|
|||
|
|
# Run the scraper only when this file is executed directly as a script.
if __name__ == "__main__":
    main()
|