刷新今日的新闻数据

This commit is contained in:
2025-11-18 08:07:31 +08:00
parent 74dfa978cf
commit 1da5501e55
7 changed files with 9204 additions and 2329 deletions

View File

@@ -8,6 +8,7 @@ import asyncio
from playwright.async_api import async_playwright
from loguru import logger
import sys
from datetime import datetime
# 配置日志
logger.remove()
@@ -120,33 +121,74 @@ class ProductHuntScraper:
try:
product_info = {}
# 提取产品名称
# Extract the product name from the page's <h1> element
name_element = await self.page.query_selector("h1")
if name_element:
product_info["name"] = await name_element.text_content()
product_info["name"] = (await name_element.text_content()).strip()
logger.info(f"产品名称: {product_info['name']}")
# 提取产品描述
desc_element = await self.page.query_selector("[data-testid='product-description']")
if not desc_element:
desc_element = await self.page.query_selector(".styles_description__")
# Extract the product introduction from the div with class "relative text-16 font-normal text-gray-700"
logger.info("正在提取产品简介...")
try:
intro_div = await self.page.query_selector('div.relative.text-16.font-normal.text-gray-700')
if intro_div:
product_info["introduction"] = (await intro_div.text_content()).strip()
logger.info(f"产品简介: {product_info['introduction'][:200]}...")
else:
logger.warning("未找到class为'relative text-16 font-normal text-gray-700'的div")
except Exception as e:
logger.error(f"提取产品简介失败: {e}")
if desc_element:
product_info["description"] = await desc_element.text_content()
logger.info(f"产品描述: {product_info['description'][:100]}...")
# Wait for the dynamically loaded maker statement: block until the <section> with class "flex flex-col gap-2" appears
logger.info("等待制作人发言动态加载...")
try:
# Wait for the section element to appear, for at most 60 seconds
section_element = await self.page.wait_for_selector(
'section.flex.flex-col.gap-2',
timeout=60000
)
if section_element:
logger.success("制作人发言区域已加载")
# Extract the maker statement: the <span> inside the div with class "flex flex-col gap-1"
maker_div = await section_element.query_selector('div.flex.flex-col.gap-1')
if maker_div:
span_element = await maker_div.query_selector('span')
if span_element:
product_info["maker_statement"] = (await span_element.text_content()).strip()
logger.info(f"制作人发言: {product_info['maker_statement'][:200]}...")
else:
logger.warning("在div中未找到span标签")
else:
logger.warning("未找到class为'flex flex-col gap-1'的div")
else:
logger.warning("制作人发言区域未加载")
except Exception as e:
logger.error(f"等待制作人发言加载失败: {e}")
# 提取投票数
votes_element = await self.page.query_selector("[data-testid='vote-button']")
if votes_element:
votes_text = await votes_element.text_content()
product_info["votes"] = votes_text
logger.info(f"投票数: {votes_text}")
# Extract the user count from the <p> element with class "text-14 font-medium text-gray-700"
logger.info("正在提取用户数...")
try:
user_count_element = await self.page.query_selector('p.text-14.font-medium.text-gray-700')
if user_count_element:
product_info["user_count"] = (await user_count_element.text_content()).strip()
logger.info(f"用户数: {product_info['user_count']}")
else:
logger.warning("未找到class为'text-14 font-medium text-gray-700'的p标签")
except Exception as e:
logger.error(f"提取用户数失败: {e}")
# 提取产品链接
website_element = await self.page.query_selector("a[href*='://']")
if website_element:
product_info["website"] = await website_element.get_attribute("href")
logger.info(f"产品网站: {product_info['website']}")
# 保存到临时文件
temp_file_path = "temp_product_info.txt"
with open(temp_file_path, "w", encoding="utf-8") as f:
f.write("=== Product Hunt 产品信息 ===\n\n")
f.write(f"产品名称: {product_info.get('name', '未获取')}\n\n")
f.write(f"产品简介: {product_info.get('introduction', '未获取')}\n\n")
f.write(f"制作人发言: {product_info.get('maker_statement', '未获取')}\n\n")
f.write(f"用户数: {product_info.get('user_count', '未获取')}\n\n")
f.write(f"提取时间: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
logger.info(f"产品信息已保存到临时文件: {temp_file_path}")
# 截取页面截图
screenshot_path = "product_screenshot.png"