# tophux_scrape/product/new_data_advanced.py

import os
import json
import time
from datetime import datetime
import requests
from bs4 import BeautifulSoup
from loguru import logger

class ProductHuntScraper:
    """Scrape basic information about a single ProductHunt product page.

    The scraper visits the ProductHunt home page first (so the session picks
    up whatever cookies the site sets), then downloads the product page,
    extracts the name, a description, the first comment and a few tags with
    a list of fallback CSS selectors, and writes the result to a JSON file.

    Attributes:
        session: ``requests.Session`` shared by all requests.
        product_url: URL of the product page to scrape. May be reassigned
            after construction.
        timeout: Timeout passed to every HTTP request, in seconds (anything
            ``requests`` accepts, including ``None`` for "no timeout").
        headers: Browser-like request headers sent with every request.
    """

    HOME_URL = "https://www.producthunt.com/"
    DEFAULT_PRODUCT_URL = "https://www.producthunt.com/products/elsie-ai-beta"
    # Without a timeout, requests waits indefinitely on a stalled connection.
    DEFAULT_TIMEOUT = 15
    # Pause between the home-page visit and the product-page request.
    WARMUP_DELAY = 2
    # Placeholder stored for any field that could not be extracted.
    NOT_FOUND = "未找到"

    # Candidate selectors, tried in order; the first one that yields
    # non-empty text wins. Later entries are progressively looser guesses.
    DESCRIPTION_SELECTORS = (
        "div.relative.text-16.font-normal.text-gray-700",
        ".text-16.font-normal.text-gray-700",
        "[class*='text-16'][class*='font-normal'][class*='text-gray-700']",
        "div[class*='description']",
        ".product-description",
        "div[class*='tagline']",
        "div[class*='subtitle']",
        "p[class*='text-gray']",
        "div[class*='mb-4']",
    )
    COMMENT_SELECTORS = (
        "div.flex.flex-1.flex-col.gap-2",
        ".flex.flex-1.flex-col.gap-2",
        "[class*='flex'][class*='flex-1'][class*='flex-col'][class*='gap-2']",
        "div[class*='comment']",
        ".comment-text",
        "div[class*='review']",
        "div[class*='feedback']",
        "blockquote",
        "div[class*='border']",
    )
    TAG_SELECTOR = "[class*='tag'], [class*='category'], [class*='topic']"

    def __init__(self, product_url=DEFAULT_PRODUCT_URL, timeout=DEFAULT_TIMEOUT):
        """Create a scraper with its own HTTP session.

        Args:
            product_url: Product page to scrape. Defaults to the page the
                script was originally written for.
            timeout: Per-request timeout in seconds.
        """
        self.session = requests.Session()
        self.product_url = product_url
        self.timeout = timeout

        # Richer request headers that imitate a real desktop Chrome browser.
        # NOTE(review): 'br' is advertised below; per the requests docs,
        # Brotli responses are only decoded when a brotli library is
        # installed -- confirm that is the case in the deployment environment.
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
            'Accept-Language': 'en-US,en;q=0.9',
            'Accept-Encoding': 'gzip, deflate, br',
            'DNT': '1',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
            'Sec-Fetch-Dest': 'document',
            'Sec-Fetch-Mode': 'navigate',
            'Sec-Fetch-Site': 'none',
            'Sec-Fetch-User': '?1',
            'Cache-Control': 'max-age=0',
            'sec-ch-ua': '"Not_A Brand";v="8", "Chromium";v="120", "Google Chrome";v="120"',
            'sec-ch-ua-mobile': '?0',
            'sec-ch-ua-platform': '"Windows"',
        }

    def get_page_content(self):
        """Download the product page and return its HTML.

        Returns:
            The response body as text when the product page answers with
            HTTP 200, otherwise ``None`` (request error or other status).
        """
        logger.info(f"正在获取页面内容: {self.product_url}")

        # Visit the home page first. This is only a warm-up, so a failure
        # here is logged and must not prevent the real request below.
        logger.info("首先访问ProductHunt主页")
        try:
            main_page = self.session.get(
                self.HOME_URL, headers=self.headers, timeout=self.timeout
            )
            logger.info(f"主页状态码: {main_page.status_code}")
        except requests.RequestException as e:
            logger.warning(f"访问主页失败，继续尝试产品页面: {str(e)}")

        # Short pause so the two requests do not arrive back-to-back.
        time.sleep(self.WARMUP_DELAY)

        try:
            response = self.session.get(
                self.product_url, headers=self.headers, timeout=self.timeout
            )
        except requests.RequestException as e:
            logger.error(f"获取页面内容失败: {str(e)}")
            return None

        if response.status_code != 200:
            logger.error(f"获取页面失败，状态码: {response.status_code}")
            # Response headers help diagnose blocks / redirects.
            logger.info(f"响应头: {response.headers}")
            return None

        logger.info("成功获取页面内容")
        return response.text

    def _extract_first_text(self, soup, selectors, label):
        """Return the text of the first element matched by ``selectors``.

        Selectors are tried in order; the first one whose element has
        non-empty stripped text is used.

        Args:
            soup: Parsed ``BeautifulSoup`` document.
            selectors: Iterable of CSS selector strings.
            label: Field name inserted into the log messages.

        Returns:
            The stripped text, or ``NOT_FOUND`` when nothing matched.
        """
        for selector in selectors:
            try:
                element = soup.select_one(selector)
                text = element.get_text(strip=True) if element is not None else ""
            except Exception as e:
                # Best effort: one bad selector must not stop the others.
                logger.debug(f"使用选择器 {selector} 提取{label}时出错: {str(e)}")
                continue
            if text:
                logger.info(f"使用选择器 {selector} 找到{label}: {text[:50]}...")
                return text

        logger.warning(f"未找到{label}")
        return self.NOT_FOUND

    def extract_product_info(self, html_content):
        """Extract product information from the page HTML.

        Args:
            html_content: HTML text of the product page.

        Returns:
            A dict with the keys ``url``, ``scraped_at``, ``name``,
            ``description`` and ``first_comment`` (missing fields hold
            ``NOT_FOUND``), plus ``tags`` (at most five, duplicates removed)
            when any tag-like element exists. ``None`` if parsing fails.
        """
        try:
            logger.info("开始解析HTML内容")
            soup = BeautifulSoup(html_content, 'html.parser')

            product_info = {
                "url": self.product_url,
                "scraped_at": datetime.now().isoformat(),
            }

            # Product name: the first <h1> on the page.
            name_element = soup.find('h1')
            if name_element is not None:
                product_info["name"] = name_element.get_text(strip=True)
                logger.info(f"产品名称: {product_info['name']}")
            else:
                logger.warning("未找到产品名称 (h1标签)")
                product_info["name"] = self.NOT_FOUND

            product_info["description"] = self._extract_first_text(
                soup, self.DESCRIPTION_SELECTORS, "产品简介"
            )
            product_info["first_comment"] = self._extract_first_text(
                soup, self.COMMENT_SELECTORS, "第一个评论"
            )

            # Tags are optional extras; a failure here must not discard the
            # fields already collected.
            try:
                tag_elements = soup.select(self.TAG_SELECTOR)
                if tag_elements:
                    texts = (tag.get_text(strip=True) for tag in tag_elements)
                    # The substring selectors match nested elements too, so
                    # the same text can repeat; keep first occurrences only.
                    tags = list(dict.fromkeys(text for text in texts if text))
                    product_info["tags"] = tags[:5]  # keep at most 5 tags
                    logger.info(f"找到标签: {tags[:3]}")
            except Exception as e:
                logger.debug(f"提取标签时出错: {str(e)}")

            return product_info

        except Exception as e:
            # Parsing boundary: callers rely on ``None`` to signal failure.
            logger.error(f"解析HTML内容失败: {str(e)}")
            return None

    def save_to_file(self, data, filename="product_info.json"):
        """Write ``data`` to ``filename`` as pretty-printed UTF-8 JSON.

        Returns:
            ``True`` on success, ``False`` if the file could not be written
            or the data could not be serialized.
        """
        try:
            with open(filename, "w", encoding="utf-8") as f:
                json.dump(data, f, ensure_ascii=False, indent=2)
        except (OSError, TypeError, ValueError) as e:
            logger.error(f"保存数据失败: {str(e)}")
            return False

        logger.info(f"数据已保存到 {filename}")
        return True

    def save_html(self, html_content, filename="product_page.html"):
        """Write the raw HTML to ``filename`` (UTF-8) for debugging.

        Returns:
            ``True`` on success, ``False`` if the file could not be written.
        """
        try:
            with open(filename, "w", encoding="utf-8") as f:
                f.write(html_content)
        except (OSError, TypeError, ValueError) as e:
            logger.error(f"保存HTML内容失败: {str(e)}")
            return False

        logger.info(f"HTML内容已保存到 {filename}")
        return True

    def scrape_product(self):
        """Run the full pipeline: download, parse and save.

        Returns:
            ``True`` only when the page was fetched, parsed and the JSON
            result was written successfully; ``False`` otherwise.
        """
        html_content = self.get_page_content()
        if not html_content:
            logger.error("无法获取页面内容")
            return False

        # Keep a copy of the raw HTML for debugging; a failure to write it
        # is logged by save_html and does not stop the scrape.
        self.save_html(html_content)

        product_info = self.extract_product_info(html_content)
        if not product_info:
            logger.error("未能提取产品信息")
            return False

        # Report failure if the result could not actually be persisted.
        return self.save_to_file(product_info)

def main():
    """Scrape the default product page once and log whether it succeeded."""
    logger.info("开始ProductHunt产品信息抓取")

    # To scrape a different product, reassign ``product_url`` on the instance
    # before calling ``scrape_product``, e.g.
    #   scraper.product_url = "https://www.producthunt.com/products/your-product"
    scraper = ProductHuntScraper()

    if not scraper.scrape_product():
        logger.error("产品信息抓取失败")
        return

    logger.info("产品信息抓取完成")

# Run the scraper only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()