今天数据更新
This commit is contained in:
File diff suppressed because it is too large
Load Diff
2596
db_modify.log
2596
db_modify.log
File diff suppressed because it is too large
Load Diff
@@ -1,6 +0,0 @@
|
||||
2025-11-07 23:49:35.277 | INFO | __main__:modify_database_structure:44 - 正在添加score字段...
|
||||
2025-11-07 23:49:35.281 | INFO | __main__:modify_database_structure:48 - 正在转换is_interested数据到score字段...
|
||||
2025-11-07 23:49:35.288 | INFO | __main__:modify_database_structure:63 - 成功添加score字段并转换数据
|
||||
2025-11-07 23:49:35.289 | INFO | __main__:modify_database_structure:71 - 验证成功:score字段已添加到articles表
|
||||
2025-11-07 23:49:35.289 | INFO | __main__:modify_database_structure:84 - 数据转换结果: score=7的记录数: 1, score=5的记录数: 1196
|
||||
2025-11-07 23:49:35.290 | INFO | __main__:<module>:99 - 数据库结构修改完成
|
||||
@@ -1,79 +0,0 @@
|
||||
import sys
|
||||
import requests
|
||||
import json
|
||||
from PySide6.QtWidgets import QApplication, QMainWindow, QListWidget, QVBoxLayout, QWidget, QLabel, QPushButton
|
||||
from PySide6.QtCore import Qt
|
||||
from loguru import logger
|
||||
|
||||
class OllamaModelViewer(QMainWindow):
    """Main window that lists the models reported by a local Ollama server.

    The window holds a title label, a list widget and a refresh button.
    The list is populated once during construction and again each time the
    refresh button is clicked.
    """

    def __init__(self):
        """Build the widgets, wire the refresh button and load the list once."""
        super().__init__()
        self.setWindowTitle("Ollama 模型查看器")
        self.setGeometry(100, 100, 600, 400)

        # Central widget that hosts the vertical layout.
        self.central_widget = QWidget()
        self.setCentralWidget(self.central_widget)

        # NOTE(review): this attribute shadows the inherited QWidget.layout()
        # method; the name is kept so existing users of `self.layout` still work.
        self.layout = QVBoxLayout()
        self.central_widget.setLayout(self.layout)

        # Title label.
        self.title_label = QLabel("当前安装的Ollama模型:")
        self.title_label.setStyleSheet("font-weight: bold; font-size: 14px;")
        self.layout.addWidget(self.title_label)

        # List widget showing one model name (or one status message) per row.
        self.model_list = QListWidget()
        self.model_list.setStyleSheet("font-family: monospace;")
        self.layout.addWidget(self.model_list)

        # Refresh button re-runs the query on demand.
        self.refresh_button = QPushButton("刷新模型列表")
        self.refresh_button.clicked.connect(self.fetch_models)
        self.layout.addWidget(self.refresh_button)

        # Initial load.
        self.fetch_models()

    def fetch_models(self):
        """Query the Ollama ``/api/tags`` endpoint and refresh the list widget.

        The list is cleared first. On success every non-empty ``"model"``
        value from the response's ``"models"`` array is added as a row. If
        there is nothing to show, or the request fails, a single explanatory
        row is added instead. All errors are logged and never propagated.
        """
        self.model_list.clear()

        try:
            logger.info("正在获取Ollama模型列表...")
            response = requests.get("http://localhost:11434/api/tags", timeout=5)

            if response.status_code != 200:
                self.model_list.addItem(f"API请求失败,状态码: {response.status_code}")
                logger.error(f"API请求失败,状态码: {response.status_code}")
                return

            data = response.json()
            # `or []` also covers an explicit null value for "models".
            models = data.get("models") or []
            # Keep only entries that actually carry a model name.
            model_names = [
                name for name in (model.get("model", "") for model in models) if name
            ]

            if not model_names:
                # Covers both an empty array and entries without a usable
                # name, so the list is never left blank without explanation.
                self.model_list.addItem("未找到任何模型")
                logger.info("未找到任何模型")
                return

            for model_name in model_names:
                self.model_list.addItem(model_name)
                logger.info(f"找到模型: {model_name}")

        except json.JSONDecodeError as e:
            # Must be listed before RequestException: recent versions of
            # requests raise a JSON decode error from response.json() that is
            # also a RequestException, which would otherwise be reported as a
            # connection failure.
            self.model_list.addItem("API响应格式错误")
            logger.error(f"API响应格式错误: {str(e)}")
        except requests.exceptions.RequestException as e:
            self.model_list.addItem("无法连接到Ollama API")
            logger.error(f"无法连接到Ollama API: {str(e)}")
        except Exception as e:
            # UI boundary: surface anything unexpected as a row instead of
            # letting it escape into the Qt event loop.
            self.model_list.addItem(f"发生错误: {str(e)}")
            logger.error(f"发生未知错误: {str(e)}")
|
||||
|
||||
if __name__ == "__main__":
    # Create the Qt application, show the viewer window, then hand control to
    # the event loop and exit with its return code.
    qt_app = QApplication(sys.argv)
    viewer = OllamaModelViewer()
    viewer.show()
    exit_code = qt_app.exec()
    sys.exit(exit_code)
|
||||
44
playwright_behavior_records_20251120_222248.json
Normal file
44
playwright_behavior_records_20251120_222248.json
Normal file
@@ -0,0 +1,44 @@
|
||||
{
|
||||
"click_records": [
|
||||
{
|
||||
"timestamp": "2025-11-20 22:22:15",
|
||||
"type": "click",
|
||||
"x": "制作人链接",
|
||||
"y": "点击制作人链接打开新窗口",
|
||||
"selector": "",
|
||||
"description": ""
|
||||
}
|
||||
],
|
||||
"dom_selection_records": [
|
||||
{
|
||||
"timestamp": "2025-11-20 22:21:55",
|
||||
"type": "dom_selection",
|
||||
"selector": "//h1",
|
||||
"description": "产品名称"
|
||||
},
|
||||
{
|
||||
"timestamp": "2025-11-20 22:21:55",
|
||||
"type": "dom_selection",
|
||||
"selector": "//*[@class=\"relative text-16 font-normal text-gray-700\"]//div",
|
||||
"description": "产品简介"
|
||||
},
|
||||
{
|
||||
"timestamp": "2025-11-20 22:21:55",
|
||||
"type": "dom_selection",
|
||||
"selector": "//*[@class=\"flex flex-row gap-2\"]//div/div[2]/span/p",
|
||||
"description": "用户数"
|
||||
},
|
||||
{
|
||||
"timestamp": "2025-11-20 22:22:15",
|
||||
"type": "dom_selection",
|
||||
"selector": "//span[contains(@class, \"absolute\")]",
|
||||
"description": "制作人span标签"
|
||||
},
|
||||
{
|
||||
"timestamp": "2025-11-20 22:22:15",
|
||||
"type": "dom_selection",
|
||||
"selector": "//span[contains(@class, \"absolute\")]/parent::a",
|
||||
"description": "制作人链接"
|
||||
}
|
||||
]
|
||||
}
|
||||
44
playwright_behavior_records_20251120_224303.json
Normal file
44
playwright_behavior_records_20251120_224303.json
Normal file
@@ -0,0 +1,44 @@
|
||||
{
|
||||
"click_records": [
|
||||
{
|
||||
"timestamp": "2025-11-20 22:42:29",
|
||||
"type": "click",
|
||||
"x": "制作人链接",
|
||||
"y": "点击制作人链接打开新窗口",
|
||||
"selector": "",
|
||||
"description": ""
|
||||
}
|
||||
],
|
||||
"dom_selection_records": [
|
||||
{
|
||||
"timestamp": "2025-11-20 22:42:09",
|
||||
"type": "dom_selection",
|
||||
"selector": "//h1",
|
||||
"description": "产品名称"
|
||||
},
|
||||
{
|
||||
"timestamp": "2025-11-20 22:42:09",
|
||||
"type": "dom_selection",
|
||||
"selector": "//*[@class=\"relative text-16 font-normal text-gray-700\"]//div",
|
||||
"description": "产品简介"
|
||||
},
|
||||
{
|
||||
"timestamp": "2025-11-20 22:42:09",
|
||||
"type": "dom_selection",
|
||||
"selector": "//*[@class=\"flex flex-row gap-2\"]//div/div[2]/span/p",
|
||||
"description": "用户数"
|
||||
},
|
||||
{
|
||||
"timestamp": "2025-11-20 22:42:29",
|
||||
"type": "dom_selection",
|
||||
"selector": "//span[contains(@class, \"absolute\")]",
|
||||
"description": "制作人span标签"
|
||||
},
|
||||
{
|
||||
"timestamp": "2025-11-20 22:42:29",
|
||||
"type": "dom_selection",
|
||||
"selector": "//span[contains(@class, \"absolute\")]/parent::a",
|
||||
"description": "制作人链接"
|
||||
}
|
||||
]
|
||||
}
|
||||
44
playwright_behavior_records_20251120_224816.json
Normal file
44
playwright_behavior_records_20251120_224816.json
Normal file
@@ -0,0 +1,44 @@
|
||||
{
|
||||
"click_records": [
|
||||
{
|
||||
"timestamp": "2025-11-20 22:46:35",
|
||||
"type": "click",
|
||||
"x": "制作人链接",
|
||||
"y": "点击制作人链接在当前窗口打开",
|
||||
"selector": "",
|
||||
"description": ""
|
||||
}
|
||||
],
|
||||
"dom_selection_records": [
|
||||
{
|
||||
"timestamp": "2025-11-20 22:46:15",
|
||||
"type": "dom_selection",
|
||||
"selector": "//h1",
|
||||
"description": "产品名称"
|
||||
},
|
||||
{
|
||||
"timestamp": "2025-11-20 22:46:15",
|
||||
"type": "dom_selection",
|
||||
"selector": "//*[@class=\"relative text-16 font-normal text-gray-700\"]//div",
|
||||
"description": "产品简介"
|
||||
},
|
||||
{
|
||||
"timestamp": "2025-11-20 22:46:15",
|
||||
"type": "dom_selection",
|
||||
"selector": "//*[@class=\"flex flex-row gap-2\"]//div/div[2]/span/p",
|
||||
"description": "用户数"
|
||||
},
|
||||
{
|
||||
"timestamp": "2025-11-20 22:46:35",
|
||||
"type": "dom_selection",
|
||||
"selector": "//span[contains(@class, \"absolute\")]",
|
||||
"description": "制作人span标签"
|
||||
},
|
||||
{
|
||||
"timestamp": "2025-11-20 22:46:35",
|
||||
"type": "dom_selection",
|
||||
"selector": "//span[contains(@class, \"absolute\")]/parent::a",
|
||||
"description": "制作人链接"
|
||||
}
|
||||
]
|
||||
}
|
||||
44
playwright_behavior_records_20251120_225835.json
Normal file
44
playwright_behavior_records_20251120_225835.json
Normal file
@@ -0,0 +1,44 @@
|
||||
{
|
||||
"click_records": [
|
||||
{
|
||||
"timestamp": "2025-11-20 22:55:58",
|
||||
"type": "click",
|
||||
"x": "制作人链接",
|
||||
"y": "点击制作人链接在当前窗口打开",
|
||||
"selector": "",
|
||||
"description": ""
|
||||
}
|
||||
],
|
||||
"dom_selection_records": [
|
||||
{
|
||||
"timestamp": "2025-11-20 22:55:37",
|
||||
"type": "dom_selection",
|
||||
"selector": "//h1",
|
||||
"description": "产品名称"
|
||||
},
|
||||
{
|
||||
"timestamp": "2025-11-20 22:55:37",
|
||||
"type": "dom_selection",
|
||||
"selector": "//*[@class=\"relative text-16 font-normal text-gray-700\"]//div",
|
||||
"description": "产品简介"
|
||||
},
|
||||
{
|
||||
"timestamp": "2025-11-20 22:55:37",
|
||||
"type": "dom_selection",
|
||||
"selector": "//*[@class=\"flex flex-row gap-2\"]//div/div[2]/span/p",
|
||||
"description": "用户数"
|
||||
},
|
||||
{
|
||||
"timestamp": "2025-11-20 22:55:58",
|
||||
"type": "dom_selection",
|
||||
"selector": "//span[contains(@class, \"absolute\")]",
|
||||
"description": "制作人span标签"
|
||||
},
|
||||
{
|
||||
"timestamp": "2025-11-20 22:55:58",
|
||||
"type": "dom_selection",
|
||||
"selector": "//span[contains(@class, \"absolute\")]/parent::a",
|
||||
"description": "制作人链接"
|
||||
}
|
||||
]
|
||||
}
|
||||
@@ -151,115 +151,74 @@ class ProductHuntScraper:
|
||||
logger.error(f"访问页面失败: {e}")
|
||||
return False
|
||||
|
||||
async def extract_maker_statement_from_new_window(self, maker_link, maker_text):
|
||||
"""模拟点击链接在新窗口中提取制作人发言内容"""
|
||||
async def extract_maker_statement_from_current_window(self, maker_link, maker_text):
|
||||
"""在当前窗口中提取制作人发言"""
|
||||
if not maker_link:
|
||||
logger.warning("制作人链接为空")
|
||||
return ""
|
||||
|
||||
if not self.page:
|
||||
logger.error("当前页面未初始化")
|
||||
return ""
|
||||
|
||||
try:
|
||||
logger.info("模拟点击制作人链接...")
|
||||
# 记录点击制作人链接的行为
|
||||
await self.record_click("制作人链接", "点击制作人链接在当前窗口打开")
|
||||
|
||||
# 查找包含制作人信息的div容器(class="flex flex-col gap-1")
|
||||
await self.record_dom_selection('div.flex.flex-col.gap-1', "制作人信息容器")
|
||||
div_container = await self.page.query_selector('div.flex.flex-col.gap-1')
|
||||
if not div_container:
|
||||
logger.warning("未找到class='flex flex-col gap-1'的div容器,使用备用方法")
|
||||
# 备用方法:直接打开新窗口
|
||||
return await self._extract_maker_statement_direct_open(maker_link, maker_text)
|
||||
# 在当前页面导航到制作人链接
|
||||
logger.info(f"正在在当前窗口打开制作人链接: {maker_link}")
|
||||
await self.page.goto(maker_link, wait_until="domcontentloaded")
|
||||
|
||||
# 获取div容器的边界框,用于点击中间位置
|
||||
bbox = await div_container.bounding_box()
|
||||
if not bbox:
|
||||
logger.warning("无法获取div容器边界框,使用备用方法")
|
||||
return await self._extract_maker_statement_direct_open(maker_link, maker_text)
|
||||
# 等待页面加载
|
||||
await self.page.wait_for_load_state("networkidle")
|
||||
|
||||
# 计算div容器中前面几个元素的高度总和
|
||||
# 获取div容器内的所有子元素
|
||||
child_elements = await div_container.query_selector_all('*')
|
||||
# 等待title元素出现并包含产品名称(最长等待2分钟)
|
||||
logger.info("等待title元素出现并包含产品名称(最长等待2分钟)...")
|
||||
try:
|
||||
# 等待title元素出现,最长等待2分钟
|
||||
await self.page.wait_for_selector("title", timeout=120000)
|
||||
|
||||
# 计算前面几个元素的高度总和
|
||||
total_height = 0
|
||||
element_count = 0
|
||||
max_elements = 3 # 考虑前面3个元素的高度
|
||||
# 检查title是否包含产品名称
|
||||
title_text = await self.page.title()
|
||||
logger.info(f"页面标题: {title_text}")
|
||||
|
||||
for child in child_elements[:max_elements]:
|
||||
child_bbox = await child.bounding_box()
|
||||
if child_bbox:
|
||||
total_height += child_bbox['height']
|
||||
element_count += 1
|
||||
logger.debug(f"元素{element_count}高度: {child_bbox['height']:.1f}px")
|
||||
# 获取产品名称(从maker_text参数中获取)
|
||||
product_name = maker_text.strip() if maker_text else ""
|
||||
|
||||
# 如果无法获取子元素高度,使用div容器高度的一半
|
||||
if total_height == 0:
|
||||
center_y = bbox['y'] + bbox['height'] / 2
|
||||
logger.info("使用div容器高度的一半作为点击位置")
|
||||
else:
|
||||
# 计算点击位置:div容器的y坐标 + 前面元素高度总和
|
||||
center_y = bbox['y'] + total_height
|
||||
logger.info(f"使用前面{element_count}个元素高度总和作为点击位置")
|
||||
|
||||
center_x = bbox['x'] + bbox['width'] / 2
|
||||
|
||||
logger.info(f"点击位置: ({center_x:.1f}, {center_y:.1f})")
|
||||
|
||||
# 记录点击行为
|
||||
await self.record_click(center_x, center_y, 'div.flex.flex-col.gap-1', "制作人链接点击")
|
||||
|
||||
# 先模拟点击,然后监听新窗口打开事件
|
||||
# 添加动态点击效果:先移动到位置,短暂停留,然后点击
|
||||
await self.page.mouse.move(center_x, center_y)
|
||||
await self.page.wait_for_timeout(2000) # 短暂停留2000毫秒,模拟用户移动鼠标
|
||||
|
||||
# 监听新窗口打开事件
|
||||
async with self.page.context.expect_page() as new_page_info:
|
||||
# 执行点击操作
|
||||
await self.page.mouse.click(center_x, center_y)
|
||||
|
||||
# 获取新页面
|
||||
new_page = await new_page_info.value
|
||||
|
||||
# 等待新页面加载完成
|
||||
await new_page.wait_for_load_state("domcontentloaded")
|
||||
await new_page.wait_for_timeout(5000) # 额外等待2秒确保内容加载
|
||||
|
||||
logger.success("新窗口已加载完成")
|
||||
|
||||
# 抓取第一个section的tag
|
||||
await self.record_dom_selection('section', "新窗口第一个section标签")
|
||||
first_section = await new_page.query_selector('section')
|
||||
if first_section:
|
||||
logger.success("找到第一个section标签")
|
||||
|
||||
# 在section下面找一个没有任何class的div标签
|
||||
await self.record_dom_selection('div:not([class])', "section下无class的div标签")
|
||||
div_without_class = await first_section.query_selector('div:not([class])')
|
||||
if div_without_class:
|
||||
logger.success("找到无class的div标签")
|
||||
|
||||
# 提取div及其子标签的所有文本内容
|
||||
maker_statement = await div_without_class.inner_text()
|
||||
result = maker_statement.strip()
|
||||
|
||||
logger.info(f"制作人发言(新窗口): {result[:2000]}...")
|
||||
if product_name and product_name.lower() in title_text.lower():
|
||||
logger.success(f"标题包含产品名称: {product_name}")
|
||||
else:
|
||||
logger.warning("未找到无class的div标签")
|
||||
# 回退到提取section的文本内容
|
||||
section_text = await first_section.inner_text()
|
||||
result = section_text.strip()
|
||||
logger.info(f"制作人发言(回退section): {result[:200]}...")
|
||||
else:
|
||||
logger.warning("未找到section标签")
|
||||
# 回退到原始a标签文本
|
||||
result = maker_text
|
||||
logger.info(f"制作人发言(回退a标签): {maker_text[:200]}...")
|
||||
logger.warning(f"标题不包含产品名称,产品名称: {product_name}")
|
||||
|
||||
# 关闭新页面
|
||||
await new_page.close()
|
||||
logger.info("新窗口已关闭")
|
||||
except Exception as e:
|
||||
logger.error(f"等待title元素失败: {e}")
|
||||
|
||||
return result
|
||||
# 再等待30秒,确保页面完全加载
|
||||
logger.info("再等待30秒,确保页面完全加载...")
|
||||
await self.page.wait_for_timeout(30000) # 等待30秒
|
||||
|
||||
except Exception as new_page_error:
|
||||
logger.error(f"模拟点击操作失败: {new_page_error}")
|
||||
# 如果模拟点击失败,使用备用方法
|
||||
return await self._extract_maker_statement_direct_open(maker_link, maker_text)
|
||||
# 提取制作人评论内容(XPath: //*[@id=\"comment-4597755\"]/div/div[2]/div/div/div)
|
||||
logger.info("正在提取制作人评论内容...")
|
||||
try:
|
||||
# 使用XPath查找评论元素
|
||||
comment_element = await self.page.query_selector(
|
||||
'xpath=//*[@id="comment-4597755"]/div/div[2]/div/div/div'
|
||||
)
|
||||
if comment_element:
|
||||
maker_statement = (await comment_element.text_content()).strip()
|
||||
logger.info(f"制作人评论内容: {maker_statement[:200]}...")
|
||||
|
||||
return maker_statement
|
||||
else:
|
||||
logger.warning("未找到XPath为//*[@id=\"comment-4597755\"]/div/div[2]/div/div/div的元素")
|
||||
except Exception as e:
|
||||
logger.error(f"提取制作人评论内容失败: {e}")
|
||||
|
||||
return ""
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"在当前窗口打开制作人链接失败: {e}")
|
||||
return ""
|
||||
|
||||
async def _extract_maker_statement_direct_open(self, maker_link, maker_text):
|
||||
"""备用方法:直接在新窗口中打开链接"""
|
||||
@@ -328,41 +287,63 @@ class ProductHuntScraper:
|
||||
try:
|
||||
product_info = {}
|
||||
|
||||
# 提取产品名称(h1标签)
|
||||
await self.record_dom_selection("h1", "产品名称")
|
||||
name_element = await self.page.query_selector("h1")
|
||||
if name_element:
|
||||
product_info["name"] = (await name_element.text_content()).strip()
|
||||
logger.info(f"产品名称: {product_info['name']}")
|
||||
# 提取产品名称(XPath: //h1)
|
||||
logger.info("正在提取产品名称...")
|
||||
try:
|
||||
await self.record_dom_selection("//h1", "产品名称")
|
||||
name_element = await self.page.query_selector("xpath=//h1")
|
||||
if name_element:
|
||||
product_info["name"] = (await name_element.text_content()).strip()
|
||||
logger.info(f"产品名称: {product_info['name']}")
|
||||
else:
|
||||
logger.warning("未找到XPath为//h1的元素")
|
||||
except Exception as e:
|
||||
logger.error(f"提取产品名称失败: {e}")
|
||||
|
||||
# 提取产品简介(class为"relative text-16 font-normal text-gray-700"的div)
|
||||
# 提取产品简介(XPath: //*[@class=\"relative text-16 font-normal text-gray-700\"]//div)
|
||||
logger.info("正在提取产品简介...")
|
||||
try:
|
||||
await self.record_dom_selection('div.relative.text-16.font-normal.text-gray-700', "产品简介")
|
||||
intro_div = await self.page.query_selector('div.relative.text-16.font-normal.text-gray-700')
|
||||
if intro_div:
|
||||
product_info["introduction"] = (await intro_div.text_content()).strip()
|
||||
await self.record_dom_selection('//*[@class="relative text-16 font-normal text-gray-700"]//div', "产品简介")
|
||||
intro_element = await self.page.query_selector('xpath=//*[@class="relative text-16 font-normal text-gray-700"]//div')
|
||||
if intro_element:
|
||||
product_info["introduction"] = (await intro_element.text_content()).strip()
|
||||
logger.info(f"产品简介: {product_info['introduction'][:200]}...")
|
||||
else:
|
||||
logger.warning("未找到class为'relative text-16 font-normal text-gray-700'的div")
|
||||
logger.warning("未找到XPath为//*[@class=\"relative text-16 font-normal text-gray-700\"]//div的元素")
|
||||
except Exception as e:
|
||||
logger.error(f"提取产品简介失败: {e}")
|
||||
|
||||
# 等待制作人发言动态加载(等待class="flex flex-col gap-2"的section标签出现)
|
||||
logger.info("等待制作人发言动态加载...")
|
||||
# 提取用户数(XPath: //*[@class=\"flex flex-row gap-2\"]//div/div[2]/span/p)
|
||||
logger.info("正在提取用户数...")
|
||||
try:
|
||||
# 等待section标签出现,最长等待60秒
|
||||
await self.record_dom_selection('section.flex.flex-col.gap-2', "制作人发言区域")
|
||||
section_element = await self.page.wait_for_selector(
|
||||
'section.flex.flex-col.gap-2',
|
||||
timeout=60000
|
||||
)
|
||||
if section_element:
|
||||
logger.success("制作人发言区域已加载")
|
||||
await self.record_dom_selection('//*[@class="flex flex-row gap-2"]//div/div[2]/span/p', "用户数")
|
||||
user_count_element = await self.page.query_selector('xpath=//*[@class="flex flex-row gap-2"]//div/div[2]/span/p')
|
||||
if user_count_element:
|
||||
product_info["user_count"] = (await user_count_element.text_content()).strip()
|
||||
logger.info(f"用户数: {product_info['user_count']}")
|
||||
else:
|
||||
logger.warning("未找到XPath为//*[@class=\"flex flex-row gap-2\"]//div/div[2]/span/p的元素")
|
||||
except Exception as e:
|
||||
logger.error(f"提取用户数失败: {e}")
|
||||
|
||||
# 查找section标签下面的第一个a标签
|
||||
await self.record_dom_selection('a', "制作人链接")
|
||||
a_element = await section_element.query_selector('a')
|
||||
# 提取制作人发言链接(XPath: //span[contains(@class, \"absolute\")]的父级a标签)
|
||||
logger.info("正在提取制作人发言链接...")
|
||||
try:
|
||||
# 增加显性等待,等待页面元素加载完成
|
||||
logger.info("等待页面元素加载...")
|
||||
await self.page.wait_for_timeout(20000) # 等待20秒
|
||||
|
||||
# 先找到包含class="absolute"的span元素
|
||||
await self.record_dom_selection('//span[contains(@class, "absolute")]', "制作人span标签")
|
||||
span_element = await self.page.query_selector('xpath=//span[contains(@class, "absolute")]')
|
||||
if span_element:
|
||||
# 找到span元素的父级a标签
|
||||
await self.record_dom_selection('//span[contains(@class, "absolute")]/parent::a', "制作人链接")
|
||||
|
||||
# 使用更可靠的方法获取父级a标签
|
||||
a_element = await span_element.evaluate_handle('(element) => element.closest("a")')
|
||||
|
||||
# 检查a_element是否为有效的元素句柄
|
||||
if a_element:
|
||||
# 提取a标签的文本内容
|
||||
maker_text = (await a_element.text_content()).strip()
|
||||
@@ -381,35 +362,14 @@ class ProductHuntScraper:
|
||||
product_info["maker_link"] = maker_link
|
||||
logger.info(f"制作人链接: {maker_link}")
|
||||
|
||||
# 调用子函数在新窗口中提取制作人发言
|
||||
product_info["maker_statement"] = await self.extract_maker_statement_from_new_window(maker_link, maker_text)
|
||||
|
||||
# 调用子函数在当前窗口中提取制作人发言
|
||||
product_info["maker_statement"] = await self.extract_maker_statement_from_current_window(maker_link, maker_text)
|
||||
else:
|
||||
logger.warning("在section中未找到a标签")
|
||||
# 如果没有a标签,尝试查找span标签
|
||||
span_element = await section_element.query_selector('span')
|
||||
if span_element:
|
||||
product_info["maker_statement"] = (await span_element.text_content()).strip()
|
||||
logger.info(f"制作人发言(回退span): {product_info['maker_statement'][:200]}...")
|
||||
else:
|
||||
logger.warning("未找到span标签")
|
||||
|
||||
logger.warning("未找到制作人链接的a标签")
|
||||
else:
|
||||
logger.warning("制作人发言区域未加载")
|
||||
logger.warning("未找到XPath为//span[contains(@class, \"absolute\")]的元素")
|
||||
except Exception as e:
|
||||
logger.error(f"等待制作人发言加载失败: {e}")
|
||||
|
||||
# 提取用户数(class="text-14 font-medium text-gray-700"的p标签)
|
||||
logger.info("正在提取用户数...")
|
||||
try:
|
||||
user_count_element = await self.page.query_selector('p.text-14.font-medium.text-gray-700')
|
||||
if user_count_element:
|
||||
product_info["user_count"] = (await user_count_element.text_content()).strip()
|
||||
logger.info(f"用户数: {product_info['user_count']}")
|
||||
else:
|
||||
logger.warning("未找到class为'text-14 font-medium text-gray-700'的p标签")
|
||||
except Exception as e:
|
||||
logger.error(f"提取用户数失败: {e}")
|
||||
logger.error(f"提取制作人发言链接失败: {e}")
|
||||
|
||||
# 保存到临时文件
|
||||
temp_file_path = "temp_product_info.txt"
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
{
|
||||
"name": "Notion",
|
||||
"introduction": "Notion is an all-in-one workspace that combines note-taking, project management, and task organization. It allows users to create customized databases, documents, and calendars to streamline their personal and professional workflows.",
|
||||
"user_count": "15K followers",
|
||||
"maker_link": "https://www.producthunt.com/products/notion/launches/ai-meeting-notes-by-notion",
|
||||
"maker_statement": "AI Meeting Notes by Notion"
|
||||
"maker_statement": "Hey Product Hunt — I’m Frank, a product designer at Notion. Today, I’m excited to introduce you to our newest kid on the block: AI Meeting Notes — or as I like to call it, /meet.With AI Meeting Notes, you get perfect meeting memory in Notion. No bots. No app switching. Just a simple /meet command on any page or one click from your Notion Calendar.Why Notion? Because your meeting notes live right where you work — connected to your docs, projects, and team. No more copy-pasting, just instant answers, searchable history, and workflows that flow.We’re already seeing folks use it not only at work, but also at home, in therapy, even in deep conversations with partners. It’s still early — we’re just graduating from alpha — but we’re moving fast and building with heart.Try it in your next few meetings. Let us know how it goes — DMs open for feedback, bugs, ideas, anything.This is, after all, our Notion.— Frank"
|
||||
}
|
||||
Binary file not shown.
|
Before Width: | Height: | Size: 804 KiB After Width: | Height: | Size: 351 KiB |
@@ -1,25 +0,0 @@
|
||||
2025-11-16 22:35:51.167 | INFO | __main__:connect_to_chrome:44 - 尝试连接到Chrome调试实例: 127.0.0.1:5003
|
||||
2025-11-16 22:35:53.051 | ERROR | __main__:connect_to_chrome:54 - 连接Chrome实例失败: Message: Unable to obtain driver for chrome; For documentation on this error, please visit: https://www.selenium.dev/documentation/webdriver/troubleshooting/errors/driver_location
|
||||
|
||||
2025-11-16 22:35:53.051 | ERROR | __main__:scrape_product:163 - 无法连接到Chrome实例
|
||||
2025-11-16 22:35:53.051 | ERROR | __main__:<module>:226 - 未能获取产品信息
|
||||
2025-11-16 22:36:46.142 | INFO | __main__:connect_to_chrome:46 - 尝试连接到Chrome调试实例: 127.0.0.1:5003
|
||||
2025-11-16 22:36:48.728 | ERROR | __main__:connect_to_chrome:58 - 连接Chrome实例失败: Could not reach host. Are you offline?
|
||||
2025-11-16 22:36:48.728 | ERROR | __main__:scrape_product:167 - 无法连接到Chrome实例
|
||||
2025-11-16 22:36:48.729 | ERROR | __main__:<module>:230 - 未能获取产品信息
|
||||
2025-11-16 22:37:25.954 | INFO | __main__:connect_to_chrome:56 - 启动新的Chrome实例
|
||||
2025-11-16 22:37:28.424 | ERROR | __main__:connect_to_chrome:73 - 连接Chrome实例失败: Could not reach host. Are you offline?
|
||||
2025-11-16 22:37:28.424 | ERROR | __main__:scrape_product:182 - 无法连接到Chrome实例
|
||||
2025-11-16 22:37:28.424 | ERROR | __main__:<module>:246 - 未能获取产品信息
|
||||
2025-11-16 23:05:28.533 | INFO | __main__:connect_to_chrome:46 - 尝试连接到Chrome调试实例: 127.0.0.1:5003
|
||||
2025-11-16 23:05:30.031 | ERROR | __main__:connect_to_chrome:58 - 连接Chrome实例失败: Could not reach host. Are you offline?
|
||||
2025-11-16 23:05:30.032 | ERROR | __main__:scrape_product:167 - 无法连接到Chrome实例
|
||||
2025-11-16 23:05:30.032 | ERROR | __main__:<module>:230 - 未能获取产品信息
|
||||
2025-11-16 23:05:56.296 | INFO | __main__:connect_to_chrome:46 - 尝试连接到Chrome调试实例: 127.0.0.1:5003
|
||||
2025-11-16 23:05:57.798 | ERROR | __main__:connect_to_chrome:58 - 连接Chrome实例失败: Could not reach host. Are you offline?
|
||||
2025-11-16 23:05:57.799 | ERROR | __main__:scrape_product:167 - 无法连接到Chrome实例
|
||||
2025-11-16 23:05:57.799 | ERROR | __main__:<module>:230 - 未能获取产品信息
|
||||
2025-11-16 23:07:50.808 | INFO | __main__:connect_to_chrome:46 - 尝试连接到Chrome调试实例: 127.0.0.1:5003
|
||||
2025-11-16 23:07:52.379 | ERROR | __main__:connect_to_chrome:58 - 连接Chrome实例失败: Could not reach host. Are you offline?
|
||||
2025-11-16 23:07:52.380 | ERROR | __main__:scrape_product:167 - 无法连接到Chrome实例
|
||||
2025-11-16 23:07:52.380 | ERROR | __main__:<module>:230 - 未能获取产品信息
|
||||
@@ -1,42 +0,0 @@
|
||||
# PowerShell script: start Chrome with remote debugging enabled so that an
# automation script can attach to the running browser.

# Location of the Chrome executable.
$chromePath = "C:\Program Files\Google\Chrome\Application\chrome.exe"

# Abort early when Chrome is not installed at the expected location.
if (!(Test-Path $chromePath)) {
    Write-Host "错误:Chrome浏览器未找到,请检查安装路径" -ForegroundColor Red
    exit 1
}

# Dedicated profile directory for the debugging session; created on first use.
$userDataDir = "C:\temp\chrome_debug"
if (!(Test-Path $userDataDir)) {
    $null = New-Item -ItemType Directory -Path $userDataDir -Force
    Write-Host "已创建用户数据目录: $userDataDir" -ForegroundColor Green
}

# Command-line switches passed to Chrome.
$arguments = @(
    "--remote-debugging-port=9222",
    "--start-maximized",
    "--user-data-dir=`"$userDataDir`""
)

Write-Host "正在启动Chrome浏览器..." -ForegroundColor Yellow
Write-Host "命令: $chromePath $arguments" -ForegroundColor Cyan

# Launch Chrome and keep the process object so the outcome can be reported.
$startParams = @{
    FilePath     = $chromePath
    ArgumentList = $arguments
    PassThru     = $true
}
$process = Start-Process @startParams

if (-not $process) {
    Write-Host "启动Chrome浏览器失败" -ForegroundColor Red
    return
}

Write-Host "Chrome浏览器已启动,进程ID: $($process.Id)" -ForegroundColor Green
Write-Host "远程调试端口: 9222" -ForegroundColor Green
Write-Host "用户数据目录: $userDataDir" -ForegroundColor Green
Write-Host ""
Write-Host "现在可以运行Playwright脚本来连接此Chrome实例" -ForegroundColor Yellow
|
||||
@@ -4,8 +4,8 @@
|
||||
|
||||
产品简介: Notion is an all-in-one workspace that combines note-taking, project management, and task organization. It allows users to create customized databases, documents, and calendars to streamline their personal and professional workflows.
|
||||
|
||||
制作人发言: AI Meeting Notes by Notion
|
||||
制作人发言: Hey Product Hunt — I’m Frank, a product designer at Notion. Today, I’m excited to introduce you to our newest kid on the block: AI Meeting Notes — or as I like to call it, /meet.With AI Meeting Notes, you get perfect meeting memory in Notion. No bots. No app switching. Just a simple /meet command on any page or one click from your Notion Calendar.Why Notion? Because your meeting notes live right where you work — connected to your docs, projects, and team. No more copy-pasting, just instant answers, searchable history, and workflows that flow.We’re already seeing folks use it not only at work, but also at home, in therapy, even in deep conversations with partners. It’s still early — we’re just graduating from alpha — but we’re moving fast and building with heart.Try it in your next few meetings. Let us know how it goes — DMs open for feedback, bugs, ideas, anything.This is, after all, our Notion.— Frank
|
||||
|
||||
用户数: 未获取
|
||||
用户数: 15K followers
|
||||
|
||||
提取时间: 2025-11-19 22:46:52
|
||||
提取时间: 2025-11-20 22:58:34
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
BIN
tophub_data.db
BIN
tophub_data.db
Binary file not shown.
22930
tophub_scraper.log
22930
tophub_scraper.log
File diff suppressed because it is too large
Load Diff
Reference in New Issue
Block a user