# Source file: tophux_scrape/product/new_data_requests.py
# (176 lines, 6.9 KiB, Python)
#
# Repository viewer warning: this file contains ambiguous Unicode characters,
# i.e. characters that might be confused with other characters (likely the
# full-width punctuation used in the Chinese comments and log messages).
# If that is intentional, the warning can safely be ignored.
import os
import json
import time
from datetime import datetime
import requests
from bs4 import BeautifulSoup
from loguru import logger
class ProductHuntScraper:
    """Fetch one Product Hunt product page and extract a few fields from it.

    The scraper downloads ``product_url`` with browser-like headers, parses
    the HTML with BeautifulSoup and collects the product name, a description
    and the first comment. Expected failures (network errors, non-200
    responses, unparsable HTML, unwritable files) are reported through the
    return value (``None`` / ``False``) and a log entry rather than raised.

    Attributes:
        session: ``requests.Session`` reused for every request.
        product_url: URL of the page to scrape; may be reassigned before
            calling :meth:`scrape_product`.
        timeout: Value passed to requests as the request ``timeout``.
        headers: Request headers that imitate a desktop browser.
    """

    # Page scraped when the caller does not supply a URL.
    DEFAULT_PRODUCT_URL = "https://www.producthunt.com/products/elsie-ai-beta"
    # Seconds to wait for the server; without a timeout requests can block
    # forever on an unresponsive host.
    DEFAULT_TIMEOUT = 10
    # Placeholder stored for any field that could not be located on the page.
    NOT_FOUND = "未找到"

    # Candidate CSS selectors for the description, tried in order.
    DESCRIPTION_SELECTORS = (
        "div.relative.text-16.font-normal.text-gray-700",
        ".text-16.font-normal.text-gray-700",
        "[class*='text-16'][class*='font-normal'][class*='text-gray-700']",
        "div[class*='description']",
        ".product-description",
        "div[class*='tagline']",
    )
    # Candidate CSS selectors for the first comment, tried in order.
    COMMENT_SELECTORS = (
        "div.flex.flex-1.flex-col.gap-2",
        ".flex.flex-1.flex-col.gap-2",
        "[class*='flex'][class*='flex-1'][class*='flex-col'][class*='gap-2']",
        "div[class*='comment']",
        ".comment-text",
        "div[class*='review']",
    )

    def __init__(self, product_url=None, timeout=DEFAULT_TIMEOUT):
        """Create the HTTP session and the browser-like request headers.

        Args:
            product_url: Page to scrape. Defaults to ``DEFAULT_PRODUCT_URL``
                when omitted or empty.
            timeout: Request timeout handed to ``requests`` (seconds, or any
                other form ``requests`` accepts).
        """
        self.session = requests.Session()
        self.product_url = product_url or self.DEFAULT_PRODUCT_URL
        self.timeout = timeout
        # Request headers that imitate a regular browser visit.
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'Referer': 'https://www.producthunt.com/',
            'DNT': '1',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
        }

    def get_page_content(self):
        """Download ``self.product_url`` and return its HTML text.

        Returns:
            The response body as ``str`` when the server answers with status
            200; ``None`` on a request error (including a timeout) or any
            other status code.
        """
        logger.info(f"正在获取页面内容: {self.product_url}")
        try:
            response = self.session.get(
                self.product_url,
                headers=self.headers,
                timeout=self.timeout,
            )
        except requests.RequestException as e:
            logger.error(f"获取页面内容失败: {str(e)}")
            return None

        # Only a plain 200 counts as success.
        if response.status_code != 200:
            logger.error(f"获取页面失败,状态码: {response.status_code}")
            return None

        logger.info("成功获取页面内容")
        return response.text

    @staticmethod
    def _first_match(soup, selectors):
        """Find the text of the first element matched by any of ``selectors``.

        Args:
            soup: Object exposing ``select_one(selector)`` (a BeautifulSoup
                document in this class).
            selectors: CSS selectors to try, in priority order.

        Returns:
            ``(selector, text, errors)``: the selector that matched and the
            stripped text of its element, or ``(None, None, errors)`` when
            nothing matched. ``errors`` lists ``(selector, exception)`` for
            every selector that raised before the search ended.
        """
        errors = []
        for selector in selectors:
            try:
                element = soup.select_one(selector)
                text = None if element is None else element.get_text(strip=True)
            except Exception as exc:
                # One failing selector must not prevent trying the others.
                errors.append((selector, exc))
                continue
            if text is not None:
                return selector, text, errors
        return None, None, errors

    def _extract_by_selectors(self, soup, selectors, label):
        """Return the first matching text for ``selectors``, logging the outcome.

        ``label`` is the human-readable field name inserted into the log
        messages. Falls back to ``NOT_FOUND`` when no selector matches.
        """
        selector, text, errors = self._first_match(soup, selectors)
        for failed_selector, exc in errors:
            logger.debug(f"使用选择器 {failed_selector} 提取{label}时出错: {str(exc)}")
        if text is None:
            logger.warning(f"未找到{label}")
            return self.NOT_FOUND
        logger.info(f"使用选择器 {selector} 找到{label}: {text[:50]}...")
        return text

    def _extract_name(self, soup):
        """Return the text of the first ``<h1>``, or ``NOT_FOUND`` if absent."""
        heading = soup.find('h1')
        if heading is None:
            logger.warning("未找到产品名称 (h1标签)")
            return self.NOT_FOUND
        name = heading.get_text(strip=True)
        logger.info(f"产品名称: {name}")
        return name

    def extract_product_info(self, html_content):
        """Parse ``html_content`` and collect the product fields.

        Args:
            html_content: HTML source of the product page.

        Returns:
            A dict with the keys ``url``, ``scraped_at`` (naive local time in
            ISO format), ``name``, ``description`` and ``first_comment``.
            Fields that cannot be located hold the ``NOT_FOUND`` placeholder.
            Returns ``None`` when parsing fails altogether.
        """
        logger.info("开始解析HTML内容")
        try:
            soup = BeautifulSoup(html_content, 'html.parser')
            product_info = {
                "url": self.product_url,
                "scraped_at": datetime.now().isoformat(),
            }
            product_info["name"] = self._extract_name(soup)
            product_info["description"] = self._extract_by_selectors(
                soup, self.DESCRIPTION_SELECTORS, "产品简介"
            )
            product_info["first_comment"] = self._extract_by_selectors(
                soup, self.COMMENT_SELECTORS, "第一个评论"
            )
            return product_info
        except Exception as e:
            # Parsing boundary: whatever the parser raises (for example on a
            # non-string input) is reported as "no result" to the caller.
            logger.error(f"解析HTML内容失败: {str(e)}")
            return None

    def save_to_file(self, data, filename="product_info.json"):
        """Write ``data`` to ``filename`` as indented UTF-8 JSON.

        Returns:
            ``True`` when the file was written, ``False`` when the file could
            not be opened/written or ``data`` is not JSON-serialisable.
        """
        try:
            with open(filename, "w", encoding="utf-8") as f:
                json.dump(data, f, ensure_ascii=False, indent=2)
        except (OSError, TypeError, ValueError) as e:
            logger.error(f"保存数据失败: {str(e)}")
            return False
        logger.info(f"数据已保存到 {filename}")
        return True

    def save_html(self, html_content, filename="product_page.html"):
        """Write the raw HTML to ``filename`` (UTF-8) for debugging.

        Returns:
            ``True`` when the file was written, ``False`` otherwise.
        """
        try:
            with open(filename, "w", encoding="utf-8") as f:
                f.write(html_content)
        except (OSError, TypeError, ValueError) as e:
            logger.error(f"保存HTML内容失败: {str(e)}")
            return False
        logger.info(f"HTML内容已保存到 {filename}")
        return True

    def scrape_product(self):
        """Run the whole pipeline: download, dump the HTML, parse, save JSON.

        Returns:
            ``True`` only when the page was fetched, parsed and the resulting
            JSON file was written; ``False`` otherwise.
        """
        html_content = self.get_page_content()
        if not html_content:
            logger.error("无法获取页面内容")
            return False

        # The HTML dump is a debugging aid only, so a failure here does not
        # abort the run.
        self.save_html(html_content)

        product_info = self.extract_product_info(html_content)
        if not product_info:
            logger.error("未能提取产品信息")
            return False

        # Propagate the save result so an unwritable output file is not
        # reported as a successful scrape.
        return self.save_to_file(product_info)


def main():
    """Scrape the default product page once and log whether it succeeded."""
    logger.info("开始ProductHunt产品信息抓取")
    scraper = ProductHuntScraper()

    # To scrape a different product, assign another URL before the call
    # below, e.g.:
    #   scraper.product_url = "https://www.producthunt.com/products/your-product"
    if scraper.scrape_product():
        logger.info("产品信息抓取完成")
        return

    logger.error("产品信息抓取失败")


# Run the scraper only when this file is executed as a script.
if __name__ == "__main__":
    main()