Files
tophux_scrape/product/new_data_advanced.py

212 lines
8.6 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
import os
import json
import time
from datetime import datetime
import requests
from bs4 import BeautifulSoup
from loguru import logger
class ProductHuntScraper:
    """Fetch a single Product Hunt product page and extract a few fields.

    The scraper first requests the site home page through a shared
    ``requests.Session`` (so any cookies set there are reused), then requests
    the product page, parses it with BeautifulSoup and writes both the raw
    HTML and the extracted data to disk.

    Attributes:
        session: The ``requests.Session`` used for every HTTP request.
        product_url: URL of the product page to scrape. May be reassigned
            after construction to target another product.
        timeout: Per-request timeout in seconds passed to ``session.get``.
        headers: Browser-like request headers sent with every request.
    """

    DEFAULT_PRODUCT_URL = "https://www.producthunt.com/products/elsie-ai-beta"
    HOME_URL = "https://www.producthunt.com/"
    # Without a timeout, requests waits indefinitely on a stalled connection.
    DEFAULT_TIMEOUT = 15
    # Placeholder stored for any field that could not be located on the page.
    NOT_FOUND = "未找到"

    # Candidate CSS selectors, tried in order; the first one yielding
    # non-empty text wins.
    DESCRIPTION_SELECTORS = (
        "div.relative.text-16.font-normal.text-gray-700",
        ".text-16.font-normal.text-gray-700",
        "[class*='text-16'][class*='font-normal'][class*='text-gray-700']",
        "div[class*='description']",
        ".product-description",
        "div[class*='tagline']",
        "div[class*='subtitle']",
        "p[class*='text-gray']",
        "div[class*='mb-4']",
    )
    COMMENT_SELECTORS = (
        "div.flex.flex-1.flex-col.gap-2",
        ".flex.flex-1.flex-col.gap-2",
        "[class*='flex'][class*='flex-1'][class*='flex-col'][class*='gap-2']",
        "div[class*='comment']",
        ".comment-text",
        "div[class*='review']",
        "div[class*='feedback']",
        "blockquote",
        "div[class*='border']",
    )
    TAG_SELECTOR = "[class*='tag'], [class*='category'], [class*='topic']"

    def __init__(self, product_url=DEFAULT_PRODUCT_URL, timeout=DEFAULT_TIMEOUT):
        """Create the HTTP session and the browser-like header set.

        Args:
            product_url: Product page to scrape. Defaults to the URL that was
                previously hard-coded, so ``ProductHuntScraper()`` behaves as
                before.
            timeout: Seconds to wait for each HTTP request before giving up.
        """
        self.session = requests.Session()
        self.product_url = product_url
        self.timeout = timeout
        # Elaborate headers that imitate a real desktop Chrome browser.
        # NOTE(review): advertising 'br' presumably requires a brotli decoder
        # to be installed for requests to decode such responses - confirm.
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
            'Accept-Language': 'en-US,en;q=0.9',
            'Accept-Encoding': 'gzip, deflate, br',
            'DNT': '1',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
            'Sec-Fetch-Dest': 'document',
            'Sec-Fetch-Mode': 'navigate',
            'Sec-Fetch-Site': 'none',
            'Sec-Fetch-User': '?1',
            'Cache-Control': 'max-age=0',
            'sec-ch-ua': '"Not_A Brand";v="8", "Chromium";v="120", "Google Chrome";v="120"',
            'sec-ch-ua-mobile': '?0',
            'sec-ch-ua-platform': '"Windows"',
        }

    def get_page_content(self):
        """Download the product page HTML.

        Returns:
            The response body as text when the product page answers with
            HTTP 200; ``None`` on any other status code or on any exception
            (including a request timeout).
        """
        try:
            logger.info(f"正在获取页面内容: {self.product_url}")
            # Visit the home page first, through the same session.
            logger.info("首先访问ProductHunt主页")
            main_page = self.session.get(
                self.HOME_URL, headers=self.headers, timeout=self.timeout
            )
            logger.info(f"主页状态码: {main_page.status_code}")
            # Short pause between the two requests to look less like a bot.
            time.sleep(2)
            response = self.session.get(
                self.product_url, headers=self.headers, timeout=self.timeout
            )
            if response.status_code != 200:
                logger.error(f"获取页面失败,状态码: {response.status_code}")
                logger.info(f"响应头: {response.headers}")
                return None
            logger.info("成功获取页面内容")
            return response.text
        except Exception as e:
            logger.error(f"获取页面内容失败: {str(e)}")
            return None

    def _extract_first_match(self, soup, selectors, label):
        """Return the text of the first selector that yields non-empty text.

        Args:
            soup: Parsed document to search.
            selectors: CSS selectors to try, in priority order.
            label: Field name interpolated into the log messages.

        Returns:
            The stripped text of the first matching element, or
            ``NOT_FOUND`` when no selector produces any text.
        """
        for selector in selectors:
            try:
                element = soup.select_one(selector)
                text = element.get_text(strip=True) if element else ""
            except Exception as e:
                # A selector that fails is skipped; the next one is tried.
                logger.debug(f"使用选择器 {selector} 提取{label}时出错: {str(e)}")
                continue
            if text:
                logger.info(f"使用选择器 {selector} 找到{label}: {text[:50]}...")
                return text
        logger.warning(f"未找到{label}")
        return self.NOT_FOUND

    def extract_product_info(self, html_content):
        """Parse the page HTML and collect the product fields.

        Args:
            html_content: Raw HTML of the product page.

        Returns:
            A dict with the keys ``url``, ``scraped_at``, ``name``,
            ``description`` and ``first_comment`` (missing fields hold the
            ``NOT_FOUND`` placeholder), plus ``tags`` (at most five entries)
            when tag-like elements exist. ``None`` if parsing fails.
        """
        try:
            logger.info("开始解析HTML内容")
            soup = BeautifulSoup(html_content, 'html.parser')
            product_info = {
                "url": self.product_url,
                "scraped_at": datetime.now().isoformat()
            }

            # Product name: text of the first <h1>.
            try:
                name_element = soup.find('h1')
                if name_element:
                    product_info["name"] = name_element.get_text(strip=True)
                    logger.info(f"产品名称: {product_info['name']}")
                else:
                    logger.warning("未找到产品名称 (h1标签)")
                    product_info["name"] = self.NOT_FOUND
            except Exception as e:
                logger.warning(f"提取产品名称时出错: {str(e)}")
                product_info["name"] = self.NOT_FOUND

            # Description and first comment share the same search strategy.
            product_info["description"] = self._extract_first_match(
                soup, self.DESCRIPTION_SELECTORS, "产品简介"
            )
            product_info["first_comment"] = self._extract_first_match(
                soup, self.COMMENT_SELECTORS, "第一个评论"
            )

            # Optional extra: tag/category/topic labels, best effort only.
            try:
                tag_elements = soup.select(self.TAG_SELECTOR)
                if tag_elements:
                    texts = (tag.get_text(strip=True) for tag in tag_elements)
                    tags = [text for text in texts if text]
                    product_info["tags"] = tags[:5]  # keep at most five tags
                    logger.info(f"找到标签: {tags[:3]}")
            except Exception as e:
                logger.debug(f"提取标签时出错: {str(e)}")

            return product_info
        except Exception as e:
            logger.error(f"解析HTML内容失败: {str(e)}")
            return None

    def save_to_file(self, data, filename="product_info.json") -> bool:
        """Write ``data`` to ``filename`` as indented UTF-8 JSON.

        Returns:
            ``True`` on success, ``False`` if writing or serialising fails.
        """
        try:
            with open(filename, "w", encoding="utf-8") as f:
                json.dump(data, f, ensure_ascii=False, indent=2)
            logger.info(f"数据已保存到 {filename}")
            return True
        except Exception as e:
            logger.error(f"保存数据失败: {str(e)}")
            return False

    def save_html(self, html_content, filename="product_page.html") -> bool:
        """Write the raw HTML to ``filename`` for debugging.

        Returns:
            ``True`` on success, ``False`` if the file cannot be written.
        """
        try:
            with open(filename, "w", encoding="utf-8") as f:
                f.write(html_content)
            logger.info(f"HTML内容已保存到 {filename}")
            return True
        except Exception as e:
            logger.error(f"保存HTML内容失败: {str(e)}")
            return False

    def scrape_product(self) -> bool:
        """Run the full pipeline: download, dump HTML, extract, save JSON.

        Returns:
            ``True`` only when the page was fetched, parsed and the extracted
            data was written to disk; ``False`` otherwise.
        """
        html_content = self.get_page_content()
        if not html_content:
            logger.error("无法获取页面内容")
            return False

        # The HTML dump is a debugging aid, so a failure here is not fatal.
        self.save_html(html_content)

        product_info = self.extract_product_info(html_content)
        if not product_info:
            logger.error("未能提取产品信息")
            return False

        # Propagate the save result so a failed write is not reported as success.
        return self.save_to_file(product_info)
def main():
    """Scrape the scraper's configured product page and log the outcome."""
    logger.info("开始ProductHunt产品信息抓取")
    scraper = ProductHuntScraper()
    # To scrape a different product, reassign the URL before scraping, e.g.
    #   scraper.product_url = "https://www.producthunt.com/products/your-product"
    if not scraper.scrape_product():
        logger.error("产品信息抓取失败")
        return
    logger.info("产品信息抓取完成")
# Run the scraper only when executed as a script, not when imported.
if __name__ == "__main__":
    main()