Files
tophux_scrape/product/new_data_standalone.py

252 lines
9.5 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Product Hunt网站数据抓取脚本
使用Selenium启动新的Chrome实例抓取Product Hunt产品信息
"""
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.service import Service
import time
import json
import re
from datetime import datetime
from loguru import logger
# Logging setup: add a file sink for records at INFO level and above,
# with the log file rotated at 10 MB.
logger.add("producthunt_scraper.log", rotation="10 MB", level="INFO")
class ProductHuntScraper:
    """Scrape basic product information from a Product Hunt product page.

    The scraper drives Chrome through Selenium. It either launches a new
    Chrome instance or attaches to an existing one through its remote
    debugging address. Instances can be used as context managers so that the
    browser session is always released::

        with ProductHuntScraper() as scraper:
            info = scraper.scrape_product(url)
    """

    # User agent passed to newly launched Chrome instances.
    _USER_AGENT = (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
    )

    def __init__(self, use_debug_address=False, debug_address="127.0.0.1:5003",
                 wait_timeout=10, settle_seconds=5):
        """Store the connection settings; no browser is started here.

        Args:
            use_debug_address (bool): Attach to an existing Chrome instance
                through ``debug_address`` instead of launching a new one.
            debug_address (str): Chrome remote debugging address, defaults to
                "127.0.0.1:5003".
            wait_timeout (float): Timeout in seconds given to the
                ``WebDriverWait`` created on connect.
            settle_seconds (float): Extra pause after a page's ``body`` is
                present, giving dynamic content time to render.
        """
        self.use_debug_address = use_debug_address
        self.debug_address = debug_address
        self.wait_timeout = wait_timeout
        self.settle_seconds = settle_seconds
        self.driver = None
        self.wait = None

    def __enter__(self):
        """Return the scraper itself; the browser is connected lazily."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Release the browser session; exceptions are not suppressed."""
        self.close()

    def connect_to_chrome(self):
        """Connect to a Chrome instance (existing or newly launched).

        Returns:
            bool: True when the driver is ready, False on any failure. On
            failure no driver is left attached to the scraper.
        """
        try:
            options = webdriver.ChromeOptions()
            if self.use_debug_address:
                # Attach to an already running Chrome instance.
                logger.info(f"尝试连接到Chrome调试实例: {self.debug_address}")
                options.add_experimental_option("debuggerAddress", self.debug_address)
            else:
                # Launch a new Chrome instance with a fixed window size and a
                # desktop user agent.
                logger.info("启动新的Chrome实例")
                for argument in (
                    "--no-sandbox",
                    "--disable-dev-shm-usage",
                    "--disable-gpu",
                    "--window-size=1920,1080",
                    f"--user-agent={self._USER_AGENT}",
                ):
                    options.add_argument(argument)
            # webdriver-manager resolves the matching ChromeDriver binary.
            service = Service(ChromeDriverManager().install())
            self.driver = webdriver.Chrome(service=service, options=options)
            self.wait = WebDriverWait(self.driver, self.wait_timeout)
            logger.info(f"成功连接到Chrome实例当前页面标题: {self.driver.title}")
            return True
        except Exception as e:
            logger.error(f"连接Chrome实例失败: {e}")
            # A driver may already exist if the failure happened after it was
            # created; drop it so the scraper does not look connected.
            self._release_driver()
            return False

    def navigate_to_product(self, product_url):
        """Open the given Product Hunt product page in the browser.

        Args:
            product_url (str): URL of the product page.

        Returns:
            bool: True when the page loaded, False on timeout or error.
        """
        try:
            logger.info(f"正在导航到产品页面: {product_url}")
            self.driver.get(product_url)
            # Wait for the document body, then give dynamic content extra time.
            self.wait.until(EC.presence_of_element_located((By.TAG_NAME, "body")))
            time.sleep(self.settle_seconds)
            logger.info(f"成功导航到产品页面,当前标题: {self.driver.title}")
            return True
        except TimeoutException:
            logger.error("页面加载超时")
            return False
        except Exception as e:
            logger.error(f"导航到产品页面失败: {e}")
            return False

    def _extract_name(self):
        """Return the text of the page's first ``h1`` or a placeholder."""
        try:
            product_name = self.driver.find_element(By.TAG_NAME, "h1").text.strip()
            logger.info(f"找到产品名称: {product_name}")
            return product_name or "未找到产品名称"
        except NoSuchElementException:
            logger.error("未找到h1标签")
            return "未找到产品名称"
        except Exception as e:
            logger.error(f"提取产品名称失败: {e}")
            return "提取失败"

    def _extract_description(self):
        """Return the text of the description ``div`` or a placeholder."""
        try:
            description_element = self.driver.find_element(
                By.CSS_SELECTOR, "div.relative.text-16.font-normal.text-gray-700"
            )
            product_description = description_element.text.strip()
            logger.info(f"找到产品简介: {product_description[:50]}...")
            return product_description or "未找到产品简介"
        except NoSuchElementException:
            logger.error("未找到产品简介div")
            return "未找到产品简介"
        except Exception as e:
            logger.error(f"提取产品简介失败: {e}")
            return "提取失败"

    def _extract_first_comment(self):
        """Return the full text of the first comment block or a placeholder."""
        try:
            comment_elements = self.driver.find_elements(
                By.CSS_SELECTOR, "div.flex.flex-1.flex-col.gap-2"
            )
            if not comment_elements:
                logger.warning("未找到任何评论")
                return "未找到评论"
            first_comment = comment_elements[0].text.strip()
            logger.info(f"找到第一个评论: {first_comment[:50]}...")
            # An empty match gets the same placeholder treatment as the
            # name and description fields.
            return first_comment or "未找到评论"
        except Exception as e:
            logger.error(f"提取第一个评论失败: {e}")
            return "提取失败"

    def _current_url_or_unknown(self):
        """Return the driver's current URL, or "未知" when it is unavailable."""
        if not self.driver:
            return "未知"
        try:
            return self.driver.current_url
        except Exception:
            return "未知"

    def extract_product_info(self):
        """Extract product information from the currently loaded page.

        Returns:
            dict: Keys ``name``, ``description``, ``first_comment``, ``url``
            and ``scraped_at``. Fields that cannot be read hold a placeholder
            string. If the extraction fails as a whole, every field holds
            "提取失败" and an additional ``error`` key carries the message.
        """
        try:
            product_info = {
                'name': self._extract_name(),
                'description': self._extract_description(),
                'first_comment': self._extract_first_comment(),
            }
            # Record where and when the data was captured.
            product_info['url'] = self.driver.current_url
            product_info['scraped_at'] = datetime.now().isoformat()
            return product_info
        except Exception as e:
            logger.error(f"提取产品信息时出错: {e}")
            return {
                'name': "提取失败",
                'description': "提取失败",
                'first_comment': "提取失败",
                # Read the URL defensively: the failure above may itself have
                # come from the driver, and this handler must not raise.
                'url': self._current_url_or_unknown(),
                'scraped_at': datetime.now().isoformat(),
                'error': str(e)
            }

    def scrape_product(self, product_url):
        """Scrape the product information found at ``product_url``.

        Connects to Chrome first when no driver is attached yet.

        Args:
            product_url (str): URL of the product page.

        Returns:
            dict: The product information, or None when connecting or
            navigating failed.
        """
        if not self.driver and not self.connect_to_chrome():
            logger.error("无法连接到Chrome实例")
            return None
        if not self.navigate_to_product(product_url):
            logger.error("无法导航到产品页面")
            return None
        return self.extract_product_info()

    def save_to_file(self, product_info, filename=None):
        """Write the product information to a JSON file.

        Args:
            product_info (dict): Product information to serialize.
            filename (str, optional): Target file name. When omitted, a
                timestamped ``product_YYYYmmdd_HHMMSS.json`` name is used.

        Returns:
            str: The name of the file that was written.

        Raises:
            Exception: Any error raised while opening or writing the file is
                logged and re-raised.
        """
        if not filename:
            now = datetime.now()
            filename = f"product_{now.strftime('%Y%m%d_%H%M%S')}.json"
        try:
            with open(filename, 'w', encoding='utf-8') as f:
                json.dump(product_info, f, ensure_ascii=False, indent=2)
            logger.info(f"产品信息已保存到 {filename}")
            return filename
        except Exception as e:
            logger.error(f"保存文件失败: {e}")
            raise

    def _release_driver(self):
        """Quit the attached driver, if any, and reset the connection state.

        Returns:
            bool: True when a driver was attached, False otherwise.
        """
        driver, self.driver, self.wait = self.driver, None, None
        if not driver:
            return False
        try:
            driver.quit()
        except Exception as e:
            # Cleanup is best-effort: a failing quit() must not mask the
            # error that led here or escape a caller's ``finally`` block.
            logger.warning(f"关闭Chrome连接时出错: {e}")
        return True

    def close(self):
        """Close the browser connection.

        Safe to call repeatedly. Afterwards the scraper is disconnected, so a
        later ``scrape_product`` call connects again.
        """
        if self._release_driver():
            logger.info("已关闭Chrome连接")
if __name__ == "__main__":
    # Example usage. Pass a product URL as the first command-line argument to
    # scrape a different page; without one the sample product below is used.
    # Set use_debug_address=True to attach to an existing Chrome debug instance.
    import sys

    default_product_url = "https://www.producthunt.com/products/elsie-ai-beta"
    product_url = sys.argv[1] if len(sys.argv) > 1 else default_product_url

    # Stays non-zero unless the product was scraped and saved, so that shells
    # and schedulers can detect a failed run.
    exit_code = 1
    scraper = ProductHuntScraper(use_debug_address=False)
    try:
        # Scrape the product information.
        product_info = scraper.scrape_product(product_url)
        if product_info:
            # Log the scraped data, then persist it to a JSON file.
            logger.info("抓取到的产品信息:")
            logger.info(json.dumps(product_info, ensure_ascii=False, indent=2))
            filename = scraper.save_to_file(product_info)
            logger.info(f"产品信息已保存到: {filename}")
            exit_code = 0
        else:
            logger.error("未能获取产品信息")
    except Exception as e:
        # logger.exception records the traceback together with the message.
        logger.exception(f"程序执行出错: {e}")
    finally:
        # Always release the Chrome instance.
        scraper.close()
    sys.exit(exit_code)