刷新今日的新闻数据
This commit is contained in:
File diff suppressed because it is too large
Load Diff
@@ -8,6 +8,7 @@ import asyncio
|
|||||||
from playwright.async_api import async_playwright
|
from playwright.async_api import async_playwright
|
||||||
from loguru import logger
|
from loguru import logger
|
||||||
import sys
|
import sys
|
||||||
|
from datetime import datetime
|
||||||
|
|
||||||
# 配置日志
|
# 配置日志
|
||||||
logger.remove()
|
logger.remove()
|
||||||
@@ -120,33 +121,74 @@ class ProductHuntScraper:
|
|||||||
try:
|
try:
|
||||||
product_info = {}
|
product_info = {}
|
||||||
|
|
||||||
# 提取产品名称
|
# 提取产品名称(h1标签)
|
||||||
name_element = await self.page.query_selector("h1")
|
name_element = await self.page.query_selector("h1")
|
||||||
if name_element:
|
if name_element:
|
||||||
product_info["name"] = await name_element.text_content()
|
product_info["name"] = (await name_element.text_content()).strip()
|
||||||
logger.info(f"产品名称: {product_info['name']}")
|
logger.info(f"产品名称: {product_info['name']}")
|
||||||
|
|
||||||
# 提取产品描述
|
# 提取产品简介(class为"relative text-16 font-normal text-gray-700"的div)
|
||||||
desc_element = await self.page.query_selector("[data-testid='product-description']")
|
logger.info("正在提取产品简介...")
|
||||||
if not desc_element:
|
try:
|
||||||
desc_element = await self.page.query_selector(".styles_description__")
|
intro_div = await self.page.query_selector('div.relative.text-16.font-normal.text-gray-700')
|
||||||
|
if intro_div:
|
||||||
|
product_info["introduction"] = (await intro_div.text_content()).strip()
|
||||||
|
logger.info(f"产品简介: {product_info['introduction'][:200]}...")
|
||||||
|
else:
|
||||||
|
logger.warning("未找到class为'relative text-16 font-normal text-gray-700'的div")
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"提取产品简介失败: {e}")
|
||||||
|
|
||||||
if desc_element:
|
# 等待制作人发言动态加载(等待class="flex flex-col gap-2"的section标签出现)
|
||||||
product_info["description"] = await desc_element.text_content()
|
logger.info("等待制作人发言动态加载...")
|
||||||
logger.info(f"产品描述: {product_info['description'][:100]}...")
|
try:
|
||||||
|
# 等待section标签出现,最长等待60秒
|
||||||
|
section_element = await self.page.wait_for_selector(
|
||||||
|
'section.flex.flex-col.gap-2',
|
||||||
|
timeout=60000
|
||||||
|
)
|
||||||
|
if section_element:
|
||||||
|
logger.success("制作人发言区域已加载")
|
||||||
|
|
||||||
|
# 提取制作人发言(class为"flex flex-col gap-1"的div里面的span标签)
|
||||||
|
maker_div = await section_element.query_selector('div.flex.flex-col.gap-1')
|
||||||
|
if maker_div:
|
||||||
|
span_element = await maker_div.query_selector('span')
|
||||||
|
if span_element:
|
||||||
|
product_info["maker_statement"] = (await span_element.text_content()).strip()
|
||||||
|
logger.info(f"制作人发言: {product_info['maker_statement'][:200]}...")
|
||||||
|
else:
|
||||||
|
logger.warning("在div中未找到span标签")
|
||||||
|
else:
|
||||||
|
logger.warning("未找到class为'flex flex-col gap-1'的div")
|
||||||
|
else:
|
||||||
|
logger.warning("制作人发言区域未加载")
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"等待制作人发言加载失败: {e}")
|
||||||
|
|
||||||
# 提取投票数
|
# 提取用户数(class="text-14 font-medium text-gray-700"的p标签)
|
||||||
votes_element = await self.page.query_selector("[data-testid='vote-button']")
|
logger.info("正在提取用户数...")
|
||||||
if votes_element:
|
try:
|
||||||
votes_text = await votes_element.text_content()
|
user_count_element = await self.page.query_selector('p.text-14.font-medium.text-gray-700')
|
||||||
product_info["votes"] = votes_text
|
if user_count_element:
|
||||||
logger.info(f"投票数: {votes_text}")
|
product_info["user_count"] = (await user_count_element.text_content()).strip()
|
||||||
|
logger.info(f"用户数: {product_info['user_count']}")
|
||||||
|
else:
|
||||||
|
logger.warning("未找到class为'text-14 font-medium text-gray-700'的p标签")
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"提取用户数失败: {e}")
|
||||||
|
|
||||||
# 提取产品链接
|
# 保存到临时文件
|
||||||
website_element = await self.page.query_selector("a[href*='://']")
|
temp_file_path = "temp_product_info.txt"
|
||||||
if website_element:
|
with open(temp_file_path, "w", encoding="utf-8") as f:
|
||||||
product_info["website"] = await website_element.get_attribute("href")
|
f.write("=== Product Hunt 产品信息 ===\n\n")
|
||||||
logger.info(f"产品网站: {product_info['website']}")
|
f.write(f"产品名称: {product_info.get('name', '未获取')}\n\n")
|
||||||
|
f.write(f"产品简介: {product_info.get('introduction', '未获取')}\n\n")
|
||||||
|
f.write(f"制作人发言: {product_info.get('maker_statement', '未获取')}\n\n")
|
||||||
|
f.write(f"用户数: {product_info.get('user_count', '未获取')}\n\n")
|
||||||
|
f.write(f"提取时间: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
|
||||||
|
|
||||||
|
logger.info(f"产品信息已保存到临时文件: {temp_file_path}")
|
||||||
|
|
||||||
# 截取页面截图
|
# 截取页面截图
|
||||||
screenshot_path = "product_screenshot.png"
|
screenshot_path = "product_screenshot.png"
|
||||||
|
|||||||
BIN
product_screenshot.png
Normal file
BIN
product_screenshot.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 642 KiB |
9
temp_product_info.txt
Normal file
9
temp_product_info.txt
Normal file
@@ -0,0 +1,9 @@
|
|||||||
|
=== Product Hunt 产品信息 ===
|
||||||
|
|
||||||
|
产品名称: Notion
|
||||||
|
|
||||||
|
产品简介: Notion is an all-in-one workspace that combines note-taking, project management, and task organization. It allows users to create customized databases, documents, and calendars to streamline their personal and professional workflows.
|
||||||
|
|
||||||
|
制作人发言: 未获取
|
||||||
|
|
||||||
|
提取时间: 2025-11-17 22:51:58
|
||||||
File diff suppressed because it is too large
Load Diff
BIN
tophub_data.db
BIN
tophub_data.db
Binary file not shown.
1099
tophub_scraper.log
1099
tophub_scraper.log
File diff suppressed because it is too large
Load Diff
Reference in New Issue
Block a user