更新今天的数据
This commit is contained in:
File diff suppressed because it is too large
Load Diff
@@ -22,6 +22,8 @@ class ProductHuntScraper:
|
|||||||
self.debug_port = debug_port
|
self.debug_port = debug_port
|
||||||
self.browser = None
|
self.browser = None
|
||||||
self.page = None
|
self.page = None
|
||||||
|
self.click_records = [] # 记录点击行为
|
||||||
|
self.dom_selection_records = [] # 记录DOM选取行为
|
||||||
|
|
||||||
async def connect_to_existing_chrome(self):
|
async def connect_to_existing_chrome(self):
|
||||||
"""连接到已运行的Chrome实例"""
|
"""连接到已运行的Chrome实例"""
|
||||||
@@ -58,7 +60,47 @@ class ProductHuntScraper:
|
|||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"连接Chrome失败: {e}")
|
logger.error(f"连接Chrome失败: {e}")
|
||||||
return False
|
return False
|
||||||
|
|
||||||
|
async def record_click(self, x, y, selector="", description=""):
    """Append one click event to ``self.click_records`` and log it.

    Args:
        x: Horizontal coordinate of the click.
        y: Vertical coordinate of the click.
        selector: Selector string associated with the click target.
        description: Free-form label for the action.
    """
    # Local wall-clock time, second resolution.
    stamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    entry = dict(
        timestamp=stamp,
        type="click",
        x=x,
        y=y,
        selector=selector,
        description=description,
    )
    self.click_records.append(entry)
    logger.info(f"记录点击: {description} - 坐标({x}, {y}) - 选择器: {selector}")
|
||||||
|
|
||||||
|
async def record_dom_selection(self, selector, description=""):
    """Append one DOM-selection event to ``self.dom_selection_records`` and log it.

    Args:
        selector: Selector string that is being queried.
        description: Free-form label for the lookup.
    """
    # Local wall-clock time, second resolution.
    stamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    entry = dict(
        timestamp=stamp,
        type="dom_selection",
        selector=selector,
        description=description,
    )
    self.dom_selection_records.append(entry)
    logger.info(f"记录DOM选取: {description} - 选择器: {selector}")
|
||||||
|
|
||||||
|
async def save_behavior_records(self):
    """Dump the collected click and DOM-selection records to a JSON file.

    The output path is relative (resolved against the current working
    directory) and named ``playwright_behavior_records_<YYYYmmdd_HHMMSS>.json``,
    where the suffix is the local time at which this method runs.
    """
    import json

    payload = {
        "click_records": self.click_records,
        "dom_selection_records": self.dom_selection_records,
    }

    stamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    filename = f"playwright_behavior_records_{stamp}.json"

    # ensure_ascii=False keeps non-ASCII descriptions readable in the file.
    with open(filename, "w", encoding="utf-8") as out:
        json.dump(payload, out, ensure_ascii=False, indent=2)

    logger.success(f"行为记录已保存到: {filename}")
|
||||||
|
|
||||||
async def navigate_to_producthunt(self, url):
|
async def navigate_to_producthunt(self, url):
|
||||||
"""导航到ProductHunt页面"""
|
"""导航到ProductHunt页面"""
|
||||||
if not self.page:
|
if not self.page:
|
||||||
@@ -115,6 +157,7 @@ class ProductHuntScraper:
|
|||||||
logger.info("模拟点击制作人链接...")
|
logger.info("模拟点击制作人链接...")
|
||||||
|
|
||||||
# 查找包含制作人信息的div容器(class="flex flex-col gap-1")
|
# 查找包含制作人信息的div容器(class="flex flex-col gap-1")
|
||||||
|
await self.record_dom_selection('div.flex.flex-col.gap-1', "制作人信息容器")
|
||||||
div_container = await self.page.query_selector('div.flex.flex-col.gap-1')
|
div_container = await self.page.query_selector('div.flex.flex-col.gap-1')
|
||||||
if not div_container:
|
if not div_container:
|
||||||
logger.warning("未找到class='flex flex-col gap-1'的div容器,使用备用方法")
|
logger.warning("未找到class='flex flex-col gap-1'的div容器,使用备用方法")
|
||||||
@@ -156,9 +199,17 @@ class ProductHuntScraper:
|
|||||||
|
|
||||||
logger.info(f"点击位置: ({center_x:.1f}, {center_y:.1f})")
|
logger.info(f"点击位置: ({center_x:.1f}, {center_y:.1f})")
|
||||||
|
|
||||||
|
# 记录点击行为
|
||||||
|
await self.record_click(center_x, center_y, 'div.flex.flex-col.gap-1', "制作人链接点击")
|
||||||
|
|
||||||
|
# 先模拟点击,然后监听新窗口打开事件
|
||||||
|
# 添加动态点击效果:先移动到位置,短暂停留,然后点击
|
||||||
|
await self.page.mouse.move(center_x, center_y)
|
||||||
|
await self.page.wait_for_timeout(2000) # 短暂停留2000毫秒,模拟用户移动鼠标
|
||||||
|
|
||||||
# 监听新窗口打开事件
|
# 监听新窗口打开事件
|
||||||
async with self.page.context.expect_page() as new_page_info:
|
async with self.page.context.expect_page() as new_page_info:
|
||||||
# 模拟点击计算出的位置
|
# 执行点击操作
|
||||||
await self.page.mouse.click(center_x, center_y)
|
await self.page.mouse.click(center_x, center_y)
|
||||||
|
|
||||||
# 获取新页面
|
# 获取新页面
|
||||||
@@ -171,11 +222,13 @@ class ProductHuntScraper:
|
|||||||
logger.success("新窗口已加载完成")
|
logger.success("新窗口已加载完成")
|
||||||
|
|
||||||
# 抓取第一个section的tag
|
# 抓取第一个section的tag
|
||||||
|
await self.record_dom_selection('section', "新窗口第一个section标签")
|
||||||
first_section = await new_page.query_selector('section')
|
first_section = await new_page.query_selector('section')
|
||||||
if first_section:
|
if first_section:
|
||||||
logger.success("找到第一个section标签")
|
logger.success("找到第一个section标签")
|
||||||
|
|
||||||
# 在section下面找一个没有任何class的div标签
|
# 在section下面找一个没有任何class的div标签
|
||||||
|
await self.record_dom_selection('div:not([class])', "section下无class的div标签")
|
||||||
div_without_class = await first_section.query_selector('div:not([class])')
|
div_without_class = await first_section.query_selector('div:not([class])')
|
||||||
if div_without_class:
|
if div_without_class:
|
||||||
logger.success("找到无class的div标签")
|
logger.success("找到无class的div标签")
|
||||||
@@ -216,17 +269,20 @@ class ProductHuntScraper:
|
|||||||
new_page = await self.browser.new_page()
|
new_page = await self.browser.new_page()
|
||||||
|
|
||||||
# 导航到制作人页面
|
# 导航到制作人页面
|
||||||
await new_page.goto(maker_link, wait_until="domcontentloaded", timeout=3000000)
|
await new_page.goto(maker_link, wait_until="domcontentloaded", timeout=30000)
|
||||||
|
|
||||||
# 等待页面加载
|
# 等待页面加载
|
||||||
await new_page.wait_for_timeout(3000)
|
await new_page.wait_for_timeout(15000)
|
||||||
|
logger.info("页面加载等待完成,开始提取内容...")
|
||||||
|
|
||||||
# 抓取第一个section的tag
|
# 抓取第一个section的tag
|
||||||
|
await self.record_dom_selection('section', "备用方法-新窗口第一个section标签")
|
||||||
first_section = await new_page.query_selector('section')
|
first_section = await new_page.query_selector('section')
|
||||||
if first_section:
|
if first_section:
|
||||||
logger.success("找到第一个section标签")
|
logger.success("找到第一个section标签")
|
||||||
|
|
||||||
# 在section下面找一个没有任何class的div标签
|
# 在section下面找一个没有任何class的div标签
|
||||||
|
await self.record_dom_selection('div:not([class])', "备用方法-section下无class的div标签")
|
||||||
div_without_class = await first_section.query_selector('div:not([class])')
|
div_without_class = await first_section.query_selector('div:not([class])')
|
||||||
if div_without_class:
|
if div_without_class:
|
||||||
logger.success("找到无class的div标签")
|
logger.success("找到无class的div标签")
|
||||||
@@ -248,6 +304,10 @@ class ProductHuntScraper:
|
|||||||
result = maker_text
|
result = maker_text
|
||||||
logger.info(f"制作人发言(回退a标签): {maker_text[:200]}...")
|
logger.info(f"制作人发言(回退a标签): {maker_text[:200]}...")
|
||||||
|
|
||||||
|
# 添加充分延迟,确保内容完全加载
|
||||||
|
logger.info("等待内容完全稳定...")
|
||||||
|
await new_page.wait_for_timeout(3000)
|
||||||
|
|
||||||
# 关闭新页面
|
# 关闭新页面
|
||||||
await new_page.close()
|
await new_page.close()
|
||||||
logger.info("新窗口已关闭")
|
logger.info("新窗口已关闭")
|
||||||
@@ -269,6 +329,7 @@ class ProductHuntScraper:
|
|||||||
product_info = {}
|
product_info = {}
|
||||||
|
|
||||||
# 提取产品名称(h1标签)
|
# 提取产品名称(h1标签)
|
||||||
|
await self.record_dom_selection("h1", "产品名称")
|
||||||
name_element = await self.page.query_selector("h1")
|
name_element = await self.page.query_selector("h1")
|
||||||
if name_element:
|
if name_element:
|
||||||
product_info["name"] = (await name_element.text_content()).strip()
|
product_info["name"] = (await name_element.text_content()).strip()
|
||||||
@@ -277,6 +338,7 @@ class ProductHuntScraper:
|
|||||||
# 提取产品简介(class为"relative text-16 font-normal text-gray-700"的div)
|
# 提取产品简介(class为"relative text-16 font-normal text-gray-700"的div)
|
||||||
logger.info("正在提取产品简介...")
|
logger.info("正在提取产品简介...")
|
||||||
try:
|
try:
|
||||||
|
await self.record_dom_selection('div.relative.text-16.font-normal.text-gray-700', "产品简介")
|
||||||
intro_div = await self.page.query_selector('div.relative.text-16.font-normal.text-gray-700')
|
intro_div = await self.page.query_selector('div.relative.text-16.font-normal.text-gray-700')
|
||||||
if intro_div:
|
if intro_div:
|
||||||
product_info["introduction"] = (await intro_div.text_content()).strip()
|
product_info["introduction"] = (await intro_div.text_content()).strip()
|
||||||
@@ -290,6 +352,7 @@ class ProductHuntScraper:
|
|||||||
logger.info("等待制作人发言动态加载...")
|
logger.info("等待制作人发言动态加载...")
|
||||||
try:
|
try:
|
||||||
# 等待section标签出现,最长等待60秒
|
# 等待section标签出现,最长等待60秒
|
||||||
|
await self.record_dom_selection('section.flex.flex-col.gap-2', "制作人发言区域")
|
||||||
section_element = await self.page.wait_for_selector(
|
section_element = await self.page.wait_for_selector(
|
||||||
'section.flex.flex-col.gap-2',
|
'section.flex.flex-col.gap-2',
|
||||||
timeout=60000
|
timeout=60000
|
||||||
@@ -298,6 +361,7 @@ class ProductHuntScraper:
|
|||||||
logger.success("制作人发言区域已加载")
|
logger.success("制作人发言区域已加载")
|
||||||
|
|
||||||
# 查找section标签下面的第一个a标签
|
# 查找section标签下面的第一个a标签
|
||||||
|
await self.record_dom_selection('a', "制作人链接")
|
||||||
a_element = await section_element.query_selector('a')
|
a_element = await section_element.query_selector('a')
|
||||||
if a_element:
|
if a_element:
|
||||||
# 提取a标签的文本内容
|
# 提取a标签的文本内容
|
||||||
@@ -412,6 +476,10 @@ async def main():
|
|||||||
with open("product_info.json", "w", encoding="utf-8") as f:
|
with open("product_info.json", "w", encoding="utf-8") as f:
|
||||||
json.dump(product_info, f, ensure_ascii=False, indent=2)
|
json.dump(product_info, f, ensure_ascii=False, indent=2)
|
||||||
logger.info("产品信息已保存到 product_info.json")
|
logger.info("产品信息已保存到 product_info.json")
|
||||||
|
|
||||||
|
# 保存点击和DOM选取行为记录
|
||||||
|
await scraper.save_behavior_records()
|
||||||
|
logger.info("行为记录已保存到 behavior_records.json")
|
||||||
else:
|
else:
|
||||||
logger.warning("未能提取到产品信息")
|
logger.warning("未能提取到产品信息")
|
||||||
|
|
||||||
|
|||||||
@@ -2,6 +2,5 @@
|
|||||||
"name": "Notion",
|
"name": "Notion",
|
||||||
"introduction": "Notion is an all-in-one workspace that combines note-taking, project management, and task organization. It allows users to create customized databases, documents, and calendars to streamline their personal and professional workflows.",
|
"introduction": "Notion is an all-in-one workspace that combines note-taking, project management, and task organization. It allows users to create customized databases, documents, and calendars to streamline their personal and professional workflows.",
|
||||||
"maker_link": "https://www.producthunt.com/products/notion/launches/ai-meeting-notes-by-notion",
|
"maker_link": "https://www.producthunt.com/products/notion/launches/ai-meeting-notes-by-notion",
|
||||||
"maker_statement": "AI Meeting Notes by Notion",
|
"maker_statement": "AI Meeting Notes by Notion"
|
||||||
"user_count": "15K followers"
|
|
||||||
}
|
}
|
||||||
Binary file not shown.
|
Before Width: | Height: | Size: 631 KiB After Width: | Height: | Size: 804 KiB |
@@ -6,6 +6,6 @@
|
|||||||
|
|
||||||
制作人发言: AI Meeting Notes by Notion
|
制作人发言: AI Meeting Notes by Notion
|
||||||
|
|
||||||
用户数: 15K followers
|
用户数: 未获取
|
||||||
|
|
||||||
提取时间: 2025-11-18 22:38:38
|
提取时间: 2025-11-19 22:46:52
|
||||||
|
|||||||
File diff suppressed because it is too large
Load Diff
BIN
tophub_data.db
BIN
tophub_data.db
Binary file not shown.
1100
tophub_scraper.log
1100
tophub_scraper.log
File diff suppressed because it is too large
Load Diff
Reference in New Issue
Block a user