Files
tophux_scrape/product/new_data_playwright.py

232 lines
9.1 KiB
Python
Raw Normal View History

import os
import json
import time
from datetime import datetime
from playwright.sync_api import sync_playwright
from loguru import logger
class ProductHuntScraper:
    """Scrape the headline details of a single Product Hunt product page.

    A Chromium instance is driven through Playwright's synchronous API:
    ``connect_to_browser`` starts it, ``navigate_to_product`` loads the page,
    ``extract_product_info`` reads the DOM and ``close`` shuts everything
    down. ``scrape_product`` runs that whole sequence and saves the result.

    Attributes are plain and public, so ``product_url`` may also be
    reassigned after construction.
    """

    DEFAULT_PRODUCT_URL = "https://www.producthunt.com/products/elsie-ai-beta"
    HOME_URL = "https://www.producthunt.com/"
    # Placeholder stored for the mandatory fields when no element is found.
    NOT_FOUND = "未找到"
    USER_AGENT = (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
    )

    def __init__(self, product_url=DEFAULT_PRODUCT_URL, headless=False):
        """Create an idle scraper; no browser is started yet.

        Args:
            product_url: Product page to scrape.
            headless: Passed to the Chromium launcher. The default of False
                keeps the browser window visible so its behaviour can be
                observed.
        """
        # Every handle starts as None so close() is safe at any point,
        # including when the browser was never (or only partly) started.
        self.playwright = None
        self.browser = None
        self.context = None
        self.page = None
        self.product_url = product_url
        self.headless = headless

    def connect_to_browser(self):
        """Start Playwright, launch Chromium and open a blank page.

        Returns:
            True on success. False if any step failed; in that case whatever
            had already been started is shut down again.
        """
        try:
            logger.info("正在初始化Playwright浏览器...")
            self.playwright = sync_playwright().start()
            self.browser = self.playwright.chromium.launch(
                headless=self.headless,
                args=[
                    "--no-sandbox",
                    "--disable-dev-shm-usage",
                    "--disable-gpu",
                    "--window-size=1920,1080",
                ],
            )
            self.context = self.browser.new_context(
                user_agent=self.USER_AGENT,
                viewport={"width": 1920, "height": 1080},
            )
            self.page = self.context.new_page()
            logger.info("成功连接到Playwright浏览器")
            return True
        except Exception as e:
            logger.error(f"连接浏览器失败: {str(e)}")
            # Do not leave a half-started driver or browser process behind.
            self._release_resources()
            return False

    def navigate_to_product(self):
        """Open the Product Hunt home page, then the product page.

        Returns:
            True once both navigations and the fixed waits have finished,
            False if anything raised (including a missing page).
        """
        try:
            logger.info(f"正在导航到产品页面: {self.product_url}")
            # The home page is visited first, before the product page.
            logger.info("首先访问ProductHunt主页")
            self.page.goto(self.HOME_URL, wait_until="domcontentloaded")
            time.sleep(3)  # Give the home page time to load.
            self.page.goto(self.product_url, wait_until="domcontentloaded")
            logger.info("等待页面加载...")
            time.sleep(10)  # Dynamic content loads after DOMContentLoaded.
            logger.info("页面加载完成")
            return True
        except Exception as e:
            logger.error(f"导航到产品页面失败: {str(e)}")
            return False

    @staticmethod
    def _stripped_text(element):
        """Return the element's stripped text, or None if it has none."""
        if not element:
            return None
        text = element.text_content()
        return None if text is None else text.strip()

    def _extract_required_text(self, selector, label, hint=None, preview=None):
        """Read the first match of ``selector``; fall back to NOT_FOUND.

        Args:
            selector: CSS selector of the wanted element.
            label: Field name used in the log messages.
            hint: Text shown in the "not found" warning (defaults to the
                selector itself).
            preview: When set, only this many characters are logged.
        """
        try:
            text = self._stripped_text(self.page.query_selector(selector))
        except Exception as e:
            logger.warning(f"提取{label}时出错: {str(e)}")
            return self.NOT_FOUND
        if text is None:
            shown_hint = selector if hint is None else hint
            logger.warning(f"未找到{label} ({shown_hint})")
            return self.NOT_FOUND
        shown = text if preview is None else f"{text[:preview]}..."
        logger.info(f"{label}: {shown}")
        return text

    def _extract_optional_text(self, selector, label):
        """Read the first match of ``selector``; return None when unavailable."""
        try:
            text = self._stripped_text(self.page.query_selector(selector))
        except Exception as e:
            logger.debug(f"提取{label}时出错: {str(e)}")
            return None
        if text is not None:
            logger.info(f"{label}: {text}")
        return text

    def _extract_tags(self, limit=5):
        """Return up to ``limit`` non-empty tag texts, or None if none matched."""
        try:
            elements = self.page.query_selector_all(
                "[class*='tag'], [class*='category'], [class*='topic']"
            )
            if not elements:
                return None
            # Read each element's text once; blank or missing texts are dropped.
            tags = [text for text in map(self._stripped_text, elements) if text]
            logger.info(f"找到标签: {tags[:3]}")
            return tags[:limit]
        except Exception as e:
            logger.debug(f"提取标签时出错: {str(e)}")
            return None

    def extract_product_info(self):
        """Collect product details from the currently loaded page.

        Returns:
            A dict with ``url``, ``scraped_at``, ``name``, ``description``
            and ``first_comment`` (the last three hold NOT_FOUND when the
            element is missing), plus ``tags``, ``likes`` and
            ``comment_count`` when matching elements exist. None if an
            unexpected error occurred.
        """
        try:
            logger.info("开始提取产品信息")
            product_info = {
                "url": self.product_url,
                "scraped_at": datetime.now().isoformat(),
            }
            product_info["name"] = self._extract_required_text(
                "h1", "产品名称", hint="h1标签"
            )
            product_info["description"] = self._extract_required_text(
                "div.relative.text-16.font-normal.text-gray-700",
                "产品简介",
                preview=50,
            )
            product_info["first_comment"] = self._extract_required_text(
                "div.flex.flex-1.flex-col.gap-2", "第一个评论", preview=50
            )

            # The remaining fields are best effort and only added when found.
            tags = self._extract_tags()
            if tags is not None:
                product_info["tags"] = tags
            optional_fields = (
                (
                    "likes",
                    "点赞数",
                    "[class*='vote'], [class*='like'], [class*='upvote']",
                ),
                (
                    "comment_count",
                    "评论数",
                    "[class*='comment-count'], [class*='comments']",
                ),
            )
            for key, label, selector in optional_fields:
                value = self._extract_optional_text(selector, label)
                if value is not None:
                    product_info[key] = value
            return product_info
        except Exception as e:
            logger.error(f"提取产品信息失败: {str(e)}")
            return None

    def save_to_file(self, data, filename="product_info.json"):
        """Write ``data`` to ``filename`` as indented UTF-8 JSON.

        Returns:
            True when the file was written, False if writing failed.
        """
        try:
            with open(filename, "w", encoding="utf-8") as f:
                json.dump(data, f, ensure_ascii=False, indent=2)
            logger.info(f"数据已保存到 {filename}")
            return True
        except Exception as e:
            logger.error(f"保存数据失败: {str(e)}")
            return False

    def save_screenshot(self, filename="product_screenshot.png"):
        """Save a screenshot of the current page for debugging.

        Returns:
            True when the screenshot was written, False otherwise.
        """
        try:
            self.page.screenshot(path=filename)
            logger.info(f"页面截图已保存到 {filename}")
            return True
        except Exception as e:
            logger.error(f"保存页面截图失败: {str(e)}")
            return False

    def _release_resources(self):
        """Shut down context, browser and driver; return True if all succeeded.

        Each handle is closed in its own try block so that one failure does
        not prevent the remaining ones from being released.
        """
        all_closed = True
        shutdown_steps = (
            ("context", "close"),
            ("browser", "close"),
            ("playwright", "stop"),
        )
        for attr, method in shutdown_steps:
            resource = getattr(self, attr, None)
            if resource is None:
                continue
            try:
                getattr(resource, method)()
            except Exception as e:
                all_closed = False
                logger.error(f"关闭浏览器时出错: {str(e)}")
            finally:
                # Drop the handle so a second close() is a no-op.
                setattr(self, attr, None)
        self.page = None
        return all_closed

    def close(self):
        """Close the browser and stop Playwright; safe to call repeatedly."""
        if self._release_resources():
            logger.info("浏览器已关闭")

    def scrape_product(self):
        """Run the full flow: connect, navigate, screenshot, extract, save.

        Returns:
            True only when the product info was extracted and written to
            disk; False otherwise. The browser is closed in every case.
        """
        if not self.connect_to_browser():
            logger.error("无法连接到浏览器")
            return False
        try:
            if not self.navigate_to_product():
                logger.error("无法导航到产品页面")
                return False
            # The screenshot is a debugging aid; its failure is not fatal.
            self.save_screenshot()
            product_info = self.extract_product_info()
            if not product_info:
                logger.error("未能提取产品信息")
                return False
            # Report failure when the data could not be persisted.
            return self.save_to_file(product_info)
        finally:
            self.close()
def main():
    """Scrape the default product once and log whether it succeeded."""
    logger.info("开始ProductHunt产品信息抓取")
    scraper = ProductHuntScraper()
    # To scrape a different product, set product_url before scraping, e.g.:
    # scraper.product_url = "https://www.producthunt.com/products/your-product"
    if scraper.scrape_product():
        logger.info("产品信息抓取完成")
        return
    logger.error("产品信息抓取失败")


if __name__ == "__main__":
    main()