更新了抓取producthunt的代码

This commit is contained in:
2025-11-23 22:14:53 +08:00
parent 9088939701
commit 4a48b9a9cb
9 changed files with 260 additions and 480 deletions

View File

@@ -31,7 +31,7 @@ logger.add(sys.stderr, level="INFO", format="<green>{time:YYYY-MM-DD HH:mm:ss}</
class ProductHuntScraperFull:
"""全功能ProductHunt数据抓取器"""
def __init__(self, tophub_db_path=None, product_db_path=None, debug_port=9222, limit=10, skip_duplicates=True):
def __init__(self, tophub_db_path=None, product_db_path=None, debug_port=9222, limit=0, skip_duplicates=True):
"""
初始化抓取器
@@ -68,12 +68,9 @@ class ProductHuntScraperFull:
conn = sqlite3.connect(self.tophub_db_path)
cursor = conn.cursor()
# 查询包含producthunt.com的链接
if limit > 0:
cursor.execute("SELECT url FROM articles WHERE url LIKE '%producthunt.com%' LIMIT ?", (limit,))
else:
cursor.execute("SELECT url FROM articles WHERE url LIKE '%producthunt.com%'")
# Query links containing producthunt.com (the LIMIT clause was removed, so this SQL returns every matching row)
cursor.execute("SELECT url FROM articles WHERE url LIKE '%producthunt.com%'")
urls = [row[0] for row in cursor.fetchall()]
conn.close()
@@ -322,7 +319,7 @@ def parse_arguments():
parser.add_argument("--tophub-db", help="tophub数据库路径", default=None)
parser.add_argument("--product-db", help="产品数据库路径", default=None)
parser.add_argument("--debug-port", type=int, help="Chrome调试端口", default=9222)
parser.add_argument("--limit", type=int, help="抓取链接数量限制", default=10)
parser.add_argument("--limit", type=int, help="抓取链接数量限制", default=0)
parser.add_argument("--no-skip-duplicates", action="store_true", help="不跳过重复URL")
parser.add_argument("--urls", nargs="+", help="指定要抓取的URL列表")
parser.add_argument("--log-file", help="日志文件路径", default="producthunt_scraper.log")

View File

@@ -114,7 +114,7 @@ class ProductHuntScraper:
# Wait for the page title to contain "Product Hunt", for at most max_wait_time seconds
logger.info("等待页面标题包含'Product Hunt'...")
max_wait_time = 300 # 最大等待时间(秒)
max_wait_time = 60 # 最大等待时间(秒)
wait_interval = 5 # 检查间隔(秒)
waited_time = 0
@@ -129,6 +129,23 @@ class ProductHuntScraper:
logger.success("Product Hunt网站已成功打开")
return True
# 检查是否遇到Cloudflare验证
if "Just a moment" in title or "请稍候" in title or "Checking your browser" in title:
logger.info("遇到Cloudflare验证等待验证完成...")
await asyncio.sleep(10) # 等待10秒
waited_time += 10
continue
# 检查是否已成功加载页面内容
try:
# 尝试查找页面中的关键元素
h1_element = await self.page.query_selector("h1")
if h1_element:
logger.success("检测到页面内容已加载")
return True
except Exception:
pass
# 等待一段时间后再次检查
await asyncio.sleep(wait_interval)
waited_time += wait_interval
@@ -165,59 +182,231 @@ class ProductHuntScraper:
# 记录点击制作人链接的行为
await self.record_click("制作人链接", "点击制作人链接在当前窗口打开")
# 保存当前页面的URL以便后续返回
original_url = self.page.url
logger.info(f"保存当前页面URL: {original_url}")
# 在当前页面导航到制作人链接
logger.info(f"正在在当前窗口打开制作人链接: {maker_link}")
await self.page.goto(maker_link, wait_until="domcontentloaded")
# 设置更长的超时时间来处理模态窗口
try:
await self.page.goto(maker_link, wait_until="domcontentloaded", timeout=60000)
logger.success("页面导航成功")
except Exception as e:
logger.error(f"页面导航失败: {e}")
# 尝试返回原始页面
try:
await self.page.goto(original_url, wait_until="domcontentloaded")
logger.success(f"已返回原始页面: {original_url}")
except Exception as return_error:
logger.error(f"返回原始页面失败: {return_error}")
return ""
# 等待页面加载
await self.page.wait_for_load_state("networkidle")
# 等待title元素出现并包含产品名称最长等待2分钟
logger.info("等待title元素出现并包含产品名称最长等待2分钟...")
# 检查并处理可能的模态窗口
try:
# 等待title元素出现最长等待2分钟
await self.page.wait_for_selector("title", timeout=120000)
logger.info("检查是否存在模态窗口...")
modal_selectors = [
"[role='dialog']",
".modal",
".modal-dialog",
"[data-testid='modal']",
"[class*='modal']",
"[class*='overlay']",
"[class*='dialog']",
"[class*='popup']"
]
# 检查title是否包含产品名称
for selector in modal_selectors:
try:
modal_element = await self.page.query_selector(selector)
if modal_element:
logger.info(f"检测到模态窗口,选择器: {selector}")
# 尝试关闭模态窗口
close_selectors = [
"[aria-label='Close']",
".close",
".modal-close",
"[data-testid='close']",
"button:has-text('Close')",
"button:has-text('关闭')",
"button:has-text('X')"
]
for close_selector in close_selectors:
try:
close_button = await modal_element.query_selector(close_selector)
if close_button:
await close_button.click()
logger.success(f"已关闭模态窗口,使用选择器: {close_selector}")
await self.page.wait_for_timeout(1000) # 等待关闭动画
break
except Exception:
continue
# 如果模态窗口仍然存在,尝试点击模态窗口外部关闭
try:
await self.page.mouse.click(10, 10) # 点击页面左上角
logger.info("尝试点击页面外部关闭模态窗口")
await self.page.wait_for_timeout(1000)
except Exception:
pass
break
except Exception:
continue
except Exception as e:
logger.warning(f"检查模态窗口时出错: {e}")
# 快速检查页面是否已加载
logger.info("快速检查页面加载状态...")
# 立即尝试获取页面内容,不等待特定元素
try:
title_text = await self.page.title()
logger.info(f"页面标题: {title_text}")
# Take the product name from the maker_text parameter
product_name = maker_text.strip() if maker_text else ""
if product_name and product_name.lower() in title_text.lower():
logger.success(f"标题包含产品名称: {product_name}")
else:
logger.warning(f"标题不包含产品名称,产品名称: {product_name}")
except Exception as e:
logger.error(f"等待title元素失败: {e}")
logger.warning(f"获取页面标题失败: {e}")
# 再等待30秒确保页面完全加载
logger.info("再等待30秒确保页面完全加载...")
await self.page.wait_for_timeout(30000) # 等待30秒
# 提取制作人评论内容XPath: //*[@id=\"comment-4597755\"]/div/div[2]/div/div/div
logger.info("正在提取制作人评论内容...")
# 快速检查页面是否有内容
try:
# 使用XPath查找评论元素
comment_element = await self.page.query_selector(
'xpath=//*[@id="comment-4597755"]/div/div[2]/div/div/div'
)
if comment_element:
maker_statement = (await comment_element.text_content()).strip()
logger.info(f"制作人评论内容: {maker_statement[:200]}...")
return maker_statement
else:
logger.warning("未找到XPath为//*[@id=\"comment-4597755\"]/div/div[2]/div/div/div的元素")
body_element = await self.page.query_selector("body")
if body_element:
body_text = await body_element.text_content()
if len(body_text.strip()) > 10:
logger.success("页面内容已加载")
else:
logger.warning("页面内容为空或过短")
except Exception as e:
logger.error(f"提取制作人评论内容失败: {e}")
logger.warning(f"检查页面内容失败: {e}")
# 短暂等待确保DOM稳定
logger.info("等待DOM稳定...")
await self.page.wait_for_timeout(2000) # 等待2秒
# 保存模态窗口截图用于调试
modal_screenshot = "modal_window_debug.png"
await self.page.screenshot(path=modal_screenshot, full_page=True)
logger.info(f"模态窗口调试截图已保存到: {modal_screenshot}")
# 首先检查页面内容,获取页面主要文本
try:
page_content = await self.page.content()
logger.info("页面内容已获取")
# 检查页面是否包含常见的关键词
keywords = ['comment', 'discussion', 'maker', 'creator', 'author', 'statement', 'description']
found_keywords = [kw for kw in keywords if kw in page_content.lower()]
if found_keywords:
logger.info(f"页面包含关键词: {found_keywords}")
else:
logger.warning("页面未检测到常见关键词")
except Exception as e:
logger.error(f"获取页面内容失败: {e}")
# 提取制作人评论内容 - 针对模态窗口的多种选择器策略
logger.info("正在提取制作人评论内容...")
# Strategy 1: try multiple XPath selectors
xpath_selectors = [
# New primary selector: a div carrying the prose, prose-format and richText classes
"//div[contains(@class, 'prose') and contains(@class, 'prose-format') and contains(@class, 'richText')]",
# 备用选择器
'//*[@id="comment-4597755"]/div/div[2]/div/div/div', # 原始选择器
'//div[contains(@class, "comment")]//div[contains(@class, "text")]', # 通用评论选择器
'//div[contains(@class, "modal")]//div[contains(@class, "content")]', # 模态窗口内容
'//div[contains(@class, "dialog")]//div[contains(@class, "body")]', # 对话框内容
'//section//div[contains(@class, "text")]', # section内的文本内容
'//div[contains(@class, "launch")]//div[contains(@class, "description")]', # 发布描述
'//article//div[contains(@class, "content")]', # 文章内容
'//main//div[contains(@class, "text")]', # 主要内容区文本
# 其他备用选择器
"//div[contains(@class, 'styles_commentsContainer')]//div[contains(@class, 'styles_comment')]//div[contains(@class, 'styles_commentBody')]//p",
"//div[contains(@class, 'comment')]//p",
"//div[contains(@class, 'comments')]//p",
]
for i, xpath in enumerate(xpath_selectors, 1):
try:
logger.info(f"尝试选择器 {i}/{len(xpath_selectors)}: {xpath}")
comment_element = await self.page.query_selector(f'xpath={xpath}')
if comment_element:
maker_statement = (await comment_element.text_content()).strip()
if maker_statement: # 确保有内容
logger.success(f"使用选择器 {i} 成功提取制作人评论内容: {maker_statement[:200]}...")
# 提取完成后返回原始页面
logger.info("提取完成,正在返回原始产品页面...")
await self.page.goto(original_url, wait_until="domcontentloaded")
logger.success(f"已成功返回原始页面: {original_url}")
return maker_statement
else:
logger.warning(f"选择器 {i} 提取的内容为空")
except Exception as e:
logger.warning(f"选择器 {i} 失败: {e}")
# Strategy 2: if every selector fails, fall back to extracting the page's main text content
logger.info("所有选择器失败,尝试提取页面主要文本内容...")
try:
# 获取页面body文本
body_element = await self.page.query_selector('body')
if body_element:
full_text = (await body_element.text_content()).strip()
# 提取前500个字符作为制作人发言
if len(full_text) > 100:
maker_statement = full_text[:500]
logger.info(f"提取页面主要文本内容: {maker_statement[:200]}...")
# 提取完成后返回原始页面
logger.info("提取完成,正在返回原始产品页面...")
await self.page.goto(original_url, wait_until="domcontentloaded")
logger.success(f"已成功返回原始页面: {original_url}")
return maker_statement
except Exception as e:
logger.error(f"提取页面主要文本内容失败: {e}")
# Strategy 3: if that also fails, save a page screenshot for debugging
logger.warning("所有提取策略都失败,保存截图用于调试...")
try:
screenshot_path = "modal_debug_screenshot.png"
await self.page.screenshot(path=screenshot_path, full_page=True)
logger.info(f"模态窗口截图已保存到: {screenshot_path}")
except Exception as e:
logger.error(f"保存截图失败: {e}")
# 即使未找到元素,也返回原始页面
logger.info("正在返回原始产品页面...")
await self.page.goto(original_url, wait_until="domcontentloaded")
logger.success(f"已成功返回原始页面: {original_url}")
return ""
except Exception as e:
logger.error(f"在当前窗口打开制作人链接失败: {e}")
# 保存当前页面截图用于调试
try:
debug_screenshot = "debug_maker_link_failure.png"
await self.page.screenshot(path=debug_screenshot, full_page=True)
logger.info(f"错误调试截图已保存到: {debug_screenshot}")
except Exception as screenshot_error:
logger.error(f"保存调试截图失败: {screenshot_error}")
# 发生异常时也尝试返回原始页面
try:
logger.info("发生异常,尝试返回原始产品页面...")
await self.page.goto(original_url, wait_until="domcontentloaded")
logger.success(f"已成功返回原始页面: {original_url}")
except Exception as return_error:
logger.error(f"返回原始页面失败: {return_error}")
return ""
async def _extract_maker_statement_direct_open(self, maker_link, maker_text):
@@ -351,19 +540,30 @@ class ProductHuntScraper:
maker_link = await a_element.get_attribute('href')
# 拼凑完整的URL
if maker_link and not maker_link.startswith('http'):
# 如果是相对路径拼凑为完整URL
base_url = "https://www.producthunt.com"
if maker_link.startswith('/'):
maker_link = base_url + maker_link
if maker_link:
if not maker_link.startswith('http'):
# 如果是相对路径拼凑为完整URL
base_url = "https://www.producthunt.com"
if maker_link.startswith('/'):
maker_link = base_url + maker_link
else:
maker_link = base_url + '/' + maker_link
# Validate the URL: it must not be just the site root
if maker_link == "https://www.producthunt.com/" or maker_link == "https://www.producthunt.com":
logger.warning(f"制作人链接无效,跳过提取: {maker_link}")
product_info["maker_link"] = ""
product_info["maker_statement"] = ""
else:
maker_link = base_url + '/' + maker_link
product_info["maker_link"] = maker_link
logger.info(f"制作人链接: {maker_link}")
# 调用子函数在当前窗口中提取制作人发言
product_info["maker_statement"] = await self.extract_maker_statement_from_current_window(maker_link, maker_text)
product_info["maker_link"] = maker_link
logger.info(f"制作人链接: {maker_link}")
# 调用子函数在当前窗口中提取制作人发言
product_info["maker_statement"] = await self.extract_maker_statement_from_current_window(maker_link, maker_text)
else:
logger.warning("未获取到制作人链接")
product_info["maker_link"] = ""
product_info["maker_statement"] = ""
else:
logger.warning("未找到制作人链接的a标签")
else:
@@ -410,7 +610,7 @@ async def main():
logger.info("开始ProductHunt数据抓取任务")
# 目标URL
target_url = "https://www.producthunt.com/products/notion"
target_url = "https://www.producthunt.com/products/palettebrain"
# 创建抓取器实例
scraper = ProductHuntScraper(debug_port=9222)

View File

@@ -0,0 +1,7 @@
{
"name": "Raycast",
"introduction": "A collection of powerful productivity tools all within an extendable launcher. Fast, ergonomic and reliable.",
"user_count": "17K followers",
"maker_link": "https://www.producthunt.com/products/raycast/launches/product-hunt-for-raycast",
"maker_statement": "Raycast for Windows"
}

Binary file not shown.

View File

@@ -1,7 +0,0 @@
{
"name": "Notion",
"introduction": "Notion is an all-in-one workspace that combines note-taking, project management, and task organization. It allows users to create customized databases, documents, and calendars to streamline their personal and professional workflows.",
"user_count": "15K followers",
"maker_link": "https://www.producthunt.com/products/notion/launches/ai-meeting-notes-by-notion",
"maker_statement": "Hey Product Hunt — Im Frank, a product designer at Notion. Today, Im excited to introduce you to our newest kid on the block: AI Meeting Notes — or as I like to call it, /meet.With AI Meeting Notes, you get perfect meeting memory in Notion. No bots. No app switching. Just a simple /meet command on any page or one click from your Notion Calendar.Why Notion? Because your meeting notes live right where you work — connected to your docs, projects, and team. No more copy-pasting, just instant answers, searchable history, and workflows that flow.Were already seeing folks use it not only at work, but also at home, in therapy, even in deep conversations with partners. Its still early — were just graduating from alpha — but were moving fast and building with heart.Try it in your next few meetings. Let us know how it goes — DMs open for feedback, bugs, ideas, anything.This is, after all, our Notion.— Frank"
}

Binary file not shown.

Before

Width:  |  Height:  |  Size: 371 KiB

View File

@@ -1,406 +0,0 @@
2025-11-23 11:00:52.606 | INFO | __main__:run_scraping:229 - === 开始ProductHunt数据抓取 ===
2025-11-23 11:00:52.607 | INFO | __main__:init_product_database:90 - 正在初始化产品数据库...
2025-11-23 11:00:52.613 | SUCCESS | __main__:init_product_database:113 - 产品数据库初始化完成
2025-11-23 11:00:52.613 | INFO | __main__:query_producthunt_urls:65 - 正在查询tophub_data.db数据库限制: 10条
2025-11-23 11:00:52.617 | SUCCESS | __main__:query_producthunt_urls:81 - 找到 10 个包含producthunt.com的链接
2025-11-23 11:00:52.617 | INFO | __main__:run_scraping:244 - 找到 10 个ProductHunt链接
2025-11-23 11:00:52.624 | INFO | __main__:run_scraping:254 - 处理URL: https://www.producthunt.com/products/pixley-ai
2025-11-23 11:00:52.624 | INFO | __main__:scrape_product_info:192 - 开始抓取: https://www.producthunt.com/products/pixley-ai
2025-11-23 11:00:52.624 | INFO | playwright_get_data:connect_to_existing_chrome:30 - 正在连接到Chrome远程调试端口 9222
2025-11-23 11:00:54.060 | ERROR | playwright_get_data:connect_to_existing_chrome:61 - 连接Chrome失败: BrowserType.connect_over_cdp: connect ECONNREFUSED ::1:9222
Call log:
- <ws preparing> retrieving websocket url from http://localhost:9222
2025-11-23 11:00:54.060 | ERROR | __main__:scrape_product_info:200 - 连接Chrome失败跳过此URL
2025-11-23 11:00:54.060 | ERROR | __main__:run_scraping:276 - 抓取产品信息失败: https://www.producthunt.com/products/pixley-ai
2025-11-23 11:00:54.060 | INFO | __main__:run_scraping:254 - 处理URL: https://www.producthunt.com/products/burner-2
2025-11-23 11:00:54.061 | INFO | __main__:run_scraping:258 - URL已存在跳过: https://www.producthunt.com/products/burner-2
2025-11-23 11:00:54.061 | INFO | __main__:run_scraping:254 - 处理URL: https://www.producthunt.com/products/american-ratings-lead-magnet-portal
2025-11-23 11:00:54.062 | INFO | __main__:scrape_product_info:192 - 开始抓取: https://www.producthunt.com/products/american-ratings-lead-magnet-portal
2025-11-23 11:00:54.062 | INFO | playwright_get_data:connect_to_existing_chrome:30 - 正在连接到Chrome远程调试端口 9222
2025-11-23 11:00:54.697 | ERROR | playwright_get_data:connect_to_existing_chrome:61 - 连接Chrome失败: BrowserType.connect_over_cdp: connect ECONNREFUSED ::1:9222
Call log:
- <ws preparing> retrieving websocket url from http://localhost:9222
2025-11-23 11:00:54.697 | ERROR | __main__:scrape_product_info:200 - 连接Chrome失败跳过此URL
2025-11-23 11:00:54.697 | ERROR | __main__:run_scraping:276 - 抓取产品信息失败: https://www.producthunt.com/products/american-ratings-lead-magnet-portal
2025-11-23 11:00:54.697 | INFO | __main__:run_scraping:254 - 处理URL: https://www.producthunt.com/products/builder-io
2025-11-23 11:00:54.698 | INFO | __main__:scrape_product_info:192 - 开始抓取: https://www.producthunt.com/products/builder-io
2025-11-23 11:00:54.698 | INFO | playwright_get_data:connect_to_existing_chrome:30 - 正在连接到Chrome远程调试端口 9222
2025-11-23 11:00:55.333 | ERROR | playwright_get_data:connect_to_existing_chrome:61 - 连接Chrome失败: BrowserType.connect_over_cdp: connect ECONNREFUSED ::1:9222
Call log:
- <ws preparing> retrieving websocket url from http://localhost:9222
2025-11-23 11:00:55.333 | ERROR | __main__:scrape_product_info:200 - 连接Chrome失败跳过此URL
2025-11-23 11:00:55.333 | ERROR | __main__:run_scraping:276 - 抓取产品信息失败: https://www.producthunt.com/products/builder-io
2025-11-23 11:00:55.333 | INFO | __main__:run_scraping:254 - 处理URL: https://www.producthunt.com/products/beebot-for-airpods
2025-11-23 11:00:55.334 | INFO | __main__:scrape_product_info:192 - 开始抓取: https://www.producthunt.com/products/beebot-for-airpods
2025-11-23 11:00:55.334 | INFO | playwright_get_data:connect_to_existing_chrome:30 - 正在连接到Chrome远程调试端口 9222
2025-11-23 11:00:55.956 | ERROR | playwright_get_data:connect_to_existing_chrome:61 - 连接Chrome失败: BrowserType.connect_over_cdp: connect ECONNREFUSED ::1:9222
Call log:
- <ws preparing> retrieving websocket url from http://localhost:9222
2025-11-23 11:00:55.956 | ERROR | __main__:scrape_product_info:200 - 连接Chrome失败跳过此URL
2025-11-23 11:00:55.956 | ERROR | __main__:run_scraping:276 - 抓取产品信息失败: https://www.producthunt.com/products/beebot-for-airpods
2025-11-23 11:00:55.957 | INFO | __main__:run_scraping:254 - 处理URL: https://www.producthunt.com/products/iisee-me
2025-11-23 11:00:55.958 | INFO | __main__:scrape_product_info:192 - 开始抓取: https://www.producthunt.com/products/iisee-me
2025-11-23 11:00:55.958 | INFO | playwright_get_data:connect_to_existing_chrome:30 - 正在连接到Chrome远程调试端口 9222
2025-11-23 11:00:56.595 | ERROR | playwright_get_data:connect_to_existing_chrome:61 - 连接Chrome失败: BrowserType.connect_over_cdp: connect ECONNREFUSED ::1:9222
Call log:
- <ws preparing> retrieving websocket url from http://localhost:9222
2025-11-23 11:00:56.595 | ERROR | __main__:scrape_product_info:200 - 连接Chrome失败跳过此URL
2025-11-23 11:00:56.595 | ERROR | __main__:run_scraping:276 - 抓取产品信息失败: https://www.producthunt.com/products/iisee-me
2025-11-23 11:00:56.595 | INFO | __main__:run_scraping:254 - 处理URL: https://www.producthunt.com/products/melodic-mind-2
2025-11-23 11:00:56.596 | INFO | __main__:scrape_product_info:192 - 开始抓取: https://www.producthunt.com/products/melodic-mind-2
2025-11-23 11:00:56.596 | INFO | playwright_get_data:connect_to_existing_chrome:30 - 正在连接到Chrome远程调试端口 9222
2025-11-23 11:00:57.200 | ERROR | playwright_get_data:connect_to_existing_chrome:61 - 连接Chrome失败: BrowserType.connect_over_cdp: connect ECONNREFUSED ::1:9222
Call log:
- <ws preparing> retrieving websocket url from http://localhost:9222
2025-11-23 11:00:57.200 | ERROR | __main__:scrape_product_info:200 - 连接Chrome失败跳过此URL
2025-11-23 11:00:57.201 | ERROR | __main__:run_scraping:276 - 抓取产品信息失败: https://www.producthunt.com/products/melodic-mind-2
2025-11-23 11:00:57.201 | INFO | __main__:run_scraping:254 - 处理URL: https://www.producthunt.com/products/agor
2025-11-23 11:00:57.202 | INFO | __main__:scrape_product_info:192 - 开始抓取: https://www.producthunt.com/products/agor
2025-11-23 11:00:57.202 | INFO | playwright_get_data:connect_to_existing_chrome:30 - 正在连接到Chrome远程调试端口 9222
2025-11-23 11:00:57.824 | ERROR | playwright_get_data:connect_to_existing_chrome:61 - 连接Chrome失败: BrowserType.connect_over_cdp: connect ECONNREFUSED ::1:9222
Call log:
- <ws preparing> retrieving websocket url from http://localhost:9222
2025-11-23 11:00:57.824 | ERROR | __main__:scrape_product_info:200 - 连接Chrome失败跳过此URL
2025-11-23 11:00:57.824 | ERROR | __main__:run_scraping:276 - 抓取产品信息失败: https://www.producthunt.com/products/agor
2025-11-23 11:00:57.825 | INFO | __main__:run_scraping:254 - 处理URL: https://www.producthunt.com/products/quiteinbox
2025-11-23 11:00:57.826 | INFO | __main__:scrape_product_info:192 - 开始抓取: https://www.producthunt.com/products/quiteinbox
2025-11-23 11:00:57.826 | INFO | playwright_get_data:connect_to_existing_chrome:30 - 正在连接到Chrome远程调试端口 9222
2025-11-23 11:00:58.451 | ERROR | playwright_get_data:connect_to_existing_chrome:61 - 连接Chrome失败: BrowserType.connect_over_cdp: connect ECONNREFUSED ::1:9222
Call log:
- <ws preparing> retrieving websocket url from http://localhost:9222
2025-11-23 11:00:58.451 | ERROR | __main__:scrape_product_info:200 - 连接Chrome失败跳过此URL
2025-11-23 11:00:58.452 | ERROR | __main__:run_scraping:276 - 抓取产品信息失败: https://www.producthunt.com/products/quiteinbox
2025-11-23 11:00:58.452 | INFO | __main__:run_scraping:254 - 处理URL: https://www.producthunt.com/products/everywhere
2025-11-23 11:00:58.453 | INFO | __main__:scrape_product_info:192 - 开始抓取: https://www.producthunt.com/products/everywhere
2025-11-23 11:00:58.453 | INFO | playwright_get_data:connect_to_existing_chrome:30 - 正在连接到Chrome远程调试端口 9222
2025-11-23 11:00:59.070 | ERROR | playwright_get_data:connect_to_existing_chrome:61 - 连接Chrome失败: BrowserType.connect_over_cdp: connect ECONNREFUSED ::1:9222
Call log:
- <ws preparing> retrieving websocket url from http://localhost:9222
2025-11-23 11:00:59.070 | ERROR | __main__:scrape_product_info:200 - 连接Chrome失败跳过此URL
2025-11-23 11:00:59.070 | ERROR | __main__:run_scraping:276 - 抓取产品信息失败: https://www.producthunt.com/products/everywhere
2025-11-23 11:00:59.071 | INFO | __main__:show_scraping_results:303 - === 抓取结果统计 ===
2025-11-23 11:00:59.071 | INFO | __main__:show_scraping_results:304 - 成功抓取: 0 个产品
2025-11-23 11:00:59.072 | INFO | __main__:show_scraping_results:305 - 跳过重复: 1 个链接
2025-11-23 11:00:59.072 | INFO | __main__:show_scraping_results:306 - 抓取失败: 9 个链接
2025-11-23 11:00:59.072 | INFO | __main__:show_scraping_results:307 - 数据库中的产品总数: 1
2025-11-23 11:00:59.072 | INFO | __main__:show_scraping_results:310 - 最新抓取的产品:
2025-11-23 11:00:59.072 | INFO | __main__:show_scraping_results:312 - - Burner: https://www.producthunt.com/products/burner-2
2025-11-23 11:00:59.072 | SUCCESS | __main__:run_scraping:284 - === ProductHunt数据抓取完成 ===
2025-11-23 11:01:18.968 | INFO | __main__:run_scraping:229 - === 开始ProductHunt数据抓取 ===
2025-11-23 11:01:18.969 | INFO | __main__:init_product_database:90 - 正在初始化产品数据库...
2025-11-23 11:01:18.970 | SUCCESS | __main__:init_product_database:113 - 产品数据库初始化完成
2025-11-23 11:01:18.970 | INFO | __main__:query_producthunt_urls:65 - 正在查询tophub_data.db数据库限制: 10条
2025-11-23 11:01:18.970 | SUCCESS | __main__:query_producthunt_urls:81 - 找到 10 个包含producthunt.com的链接
2025-11-23 11:01:18.970 | INFO | __main__:run_scraping:244 - 找到 10 个ProductHunt链接
2025-11-23 11:01:18.973 | INFO | __main__:run_scraping:254 - 处理URL: https://www.producthunt.com/products/pixley-ai
2025-11-23 11:01:18.973 | INFO | __main__:scrape_product_info:192 - 开始抓取: https://www.producthunt.com/products/pixley-ai
2025-11-23 11:01:18.974 | INFO | playwright_get_data:connect_to_existing_chrome:30 - 正在连接到Chrome远程调试端口 9222
2025-11-23 11:01:19.626 | SUCCESS | playwright_get_data:connect_to_existing_chrome:57 - 成功连接到Chrome浏览器
2025-11-23 11:01:19.626 | INFO | playwright_get_data:navigate_to_producthunt:111 - 正在访问: https://www.producthunt.com/products/pixley-ai
2025-11-23 11:01:21.582 | INFO | playwright_get_data:navigate_to_producthunt:116 - 等待页面标题包含'Product Hunt'...
2025-11-23 11:01:21.672 | INFO | playwright_get_data:navigate_to_producthunt:124 - 当前页面标题: Pixley AI: Pixley lets kids turn their ideas into cartoons in minutes | Product Hunt
2025-11-23 11:01:21.672 | SUCCESS | playwright_get_data:navigate_to_producthunt:128 - 页面标题已包含'Product Hunt',等待时间: 0秒
2025-11-23 11:01:21.672 | SUCCESS | playwright_get_data:navigate_to_producthunt:129 - Product Hunt网站已成功打开
2025-11-23 11:01:21.672 | INFO | playwright_get_data:extract_product_info:291 - 正在提取产品名称...
2025-11-23 11:01:21.673 | INFO | playwright_get_data:record_dom_selection:86 - 记录DOM选取: 产品名称 - 选择器: //h1
2025-11-23 11:01:21.724 | INFO | playwright_get_data:extract_product_info:297 - 产品名称: Pixley AI
2025-11-23 11:01:21.724 | INFO | playwright_get_data:extract_product_info:304 - 正在提取产品简介...
2025-11-23 11:01:21.725 | INFO | playwright_get_data:record_dom_selection:86 - 记录DOM选取: 产品简介 - 选择器: //*[@class="relative text-16 font-normal text-gray-700"]//div
2025-11-23 11:01:21.732 | INFO | playwright_get_data:extract_product_info:310 - 产品简介: Pixley is the first platform that lets children turn their drawings and ideas into personalized, animated cartoons in minutes. Until now, making animation was slow, expensive, and impossible to person...
2025-11-23 11:01:21.732 | INFO | playwright_get_data:extract_product_info:317 - 正在提取用户数...
2025-11-23 11:01:21.732 | INFO | playwright_get_data:record_dom_selection:86 - 记录DOM选取: 用户数 - 选择器: //*[@class="flex flex-row gap-2"]//div/div[2]/span/p
2025-11-23 11:01:21.738 | INFO | playwright_get_data:extract_product_info:323 - 用户数: 53 followers
2025-11-23 11:01:21.738 | INFO | playwright_get_data:extract_product_info:330 - 正在提取制作人发言链接...
2025-11-23 11:01:21.738 | INFO | playwright_get_data:extract_product_info:333 - 等待页面元素加载...
2025-11-23 11:01:41.743 | INFO | playwright_get_data:record_dom_selection:86 - 记录DOM选取: 制作人span标签 - 选择器: //span[contains(@class, "absolute")]
2025-11-23 11:01:41.751 | WARNING | playwright_get_data:extract_product_info:370 - 未找到XPath为//span[contains(@class, "absolute")]的元素
2025-11-23 11:01:41.753 | INFO | playwright_get_data:extract_product_info:384 - 产品信息已保存到临时文件: temp_product_info.txt
2025-11-23 11:01:42.074 | INFO | playwright_get_data:extract_product_info:389 - 页面截图已保存到: product_screenshot.png
2025-11-23 11:01:42.074 | SUCCESS | __main__:scrape_product_info:214 - 成功提取产品信息: Pixley AI
2025-11-23 11:01:42.080 | INFO | playwright_get_data:close:401 - 浏览器连接已关闭
2025-11-23 11:01:42.093 | INFO | playwright_get_data:close:405 - Playwright实例已关闭
2025-11-23 11:01:42.094 | INFO | __main__:save_product_info:179 - 新增产品信息: Pixley AI
2025-11-23 11:01:42.097 | SUCCESS | __main__:run_scraping:270 - 成功保存产品信息: Pixley AI
2025-11-23 11:01:42.098 | INFO | __main__:run_scraping:254 - 处理URL: https://www.producthunt.com/products/burner-2
2025-11-23 11:01:42.098 | INFO | __main__:run_scraping:258 - URL已存在跳过: https://www.producthunt.com/products/burner-2
2025-11-23 11:01:42.099 | INFO | __main__:run_scraping:254 - 处理URL: https://www.producthunt.com/products/american-ratings-lead-magnet-portal
2025-11-23 11:01:42.099 | INFO | __main__:scrape_product_info:192 - 开始抓取: https://www.producthunt.com/products/american-ratings-lead-magnet-portal
2025-11-23 11:01:42.099 | INFO | playwright_get_data:connect_to_existing_chrome:30 - 正在连接到Chrome远程调试端口 9222
2025-11-23 11:01:42.765 | SUCCESS | playwright_get_data:connect_to_existing_chrome:57 - 成功连接到Chrome浏览器
2025-11-23 11:01:42.765 | INFO | playwright_get_data:navigate_to_producthunt:111 - 正在访问: https://www.producthunt.com/products/american-ratings-lead-magnet-portal
2025-11-23 11:02:02.769 | INFO | playwright_get_data:navigate_to_producthunt:116 - 等待页面标题包含'Product Hunt'...
2025-11-23 11:02:02.775 | INFO | playwright_get_data:navigate_to_producthunt:124 - 当前页面标题: American Ratings Lead Magnet Portal: Get Your Verified A-I-R-S Number & Boost Global Credibility | Product Hunt
2025-11-23 11:02:02.775 | SUCCESS | playwright_get_data:navigate_to_producthunt:128 - 页面标题已包含'Product Hunt',等待时间: 0秒
2025-11-23 11:02:02.775 | SUCCESS | playwright_get_data:navigate_to_producthunt:129 - Product Hunt网站已成功打开
2025-11-23 11:02:02.776 | INFO | playwright_get_data:extract_product_info:291 - 正在提取产品名称...
2025-11-23 11:02:02.776 | INFO | playwright_get_data:record_dom_selection:86 - 记录DOM选取: 产品名称 - 选择器: //h1
2025-11-23 11:02:02.807 | INFO | playwright_get_data:extract_product_info:297 - 产品名称: American Ratings Lead Magnet Portal
2025-11-23 11:02:02.807 | INFO | playwright_get_data:extract_product_info:304 - 正在提取产品简介...
2025-11-23 11:02:02.808 | INFO | playwright_get_data:record_dom_selection:86 - 记录DOM选取: 产品简介 - 选择器: //*[@class="relative text-16 font-normal text-gray-700"]//div
2025-11-23 11:02:02.814 | INFO | playwright_get_data:extract_product_info:310 - 产品简介: Build verified business credibility with the American Ratings Lead Magnet Portal — the trusted platform for authentic verification and global rating credentials. Get your A-I-R-S Number to showcase tr...
2025-11-23 11:02:02.815 | INFO | playwright_get_data:extract_product_info:317 - 正在提取用户数...
2025-11-23 11:02:02.815 | INFO | playwright_get_data:record_dom_selection:86 - 记录DOM选取: 用户数 - 选择器: //*[@class="flex flex-row gap-2"]//div/div[2]/span/p
2025-11-23 11:02:02.821 | INFO | playwright_get_data:extract_product_info:323 - 用户数: 24 followers
2025-11-23 11:02:02.821 | INFO | playwright_get_data:extract_product_info:330 - 正在提取制作人发言链接...
2025-11-23 11:02:02.821 | INFO | playwright_get_data:extract_product_info:333 - 等待页面元素加载...
2025-11-23 11:02:22.834 | INFO | playwright_get_data:record_dom_selection:86 - 记录DOM选取: 制作人span标签 - 选择器: //span[contains(@class, "absolute")]
2025-11-23 11:02:22.842 | INFO | playwright_get_data:record_dom_selection:86 - 记录DOM选取: 制作人链接 - 选择器: //span[contains(@class, "absolute")]/parent::a
2025-11-23 11:02:22.852 | INFO | playwright_get_data:extract_product_info:363 - 制作人链接: https://www.producthunt.com/p/american-ratings-lead-magnet-portal/a-i-r-s-number-american-ratings-lead-magnet-webinar-channel-partner-credit-100k-25m
2025-11-23 11:02:22.852 | INFO | playwright_get_data:record_click:75 - 记录点击: - 坐标(制作人链接, 点击制作人链接在当前窗口打开) - 选择器:
2025-11-23 11:02:22.852 | INFO | playwright_get_data:extract_maker_statement_from_current_window:169 - 正在在当前窗口打开制作人链接: https://www.producthunt.com/p/american-ratings-lead-magnet-portal/a-i-r-s-number-american-ratings-lead-magnet-webinar-channel-partner-credit-100k-25m
2025-11-23 11:02:55.175 | ERROR | playwright_get_data:extract_maker_statement_from_current_window:220 - 在当前窗口打开制作人链接失败: Timeout 30000ms exceeded.
2025-11-23 11:02:55.176 | INFO | playwright_get_data:extract_product_info:384 - 产品信息已保存到临时文件: temp_product_info.txt
2025-11-23 11:02:55.513 | INFO | playwright_get_data:extract_product_info:389 - 页面截图已保存到: product_screenshot.png
2025-11-23 11:02:55.514 | SUCCESS | __main__:scrape_product_info:214 - 成功提取产品信息: American Ratings Lead Magnet Portal
2025-11-23 11:02:55.519 | INFO | playwright_get_data:close:401 - 浏览器连接已关闭
2025-11-23 11:02:55.529 | INFO | playwright_get_data:close:405 - Playwright实例已关闭
2025-11-23 11:02:55.532 | INFO | __main__:save_product_info:179 - 新增产品信息: American Ratings Lead Magnet Portal
2025-11-23 11:02:55.535 | SUCCESS | __main__:run_scraping:270 - 成功保存产品信息: American Ratings Lead Magnet Portal
2025-11-23 11:02:55.536 | INFO | __main__:run_scraping:254 - 处理URL: https://www.producthunt.com/products/builder-io
2025-11-23 11:02:55.537 | INFO | __main__:scrape_product_info:192 - 开始抓取: https://www.producthunt.com/products/builder-io
2025-11-23 11:02:55.537 | INFO | playwright_get_data:connect_to_existing_chrome:30 - 正在连接到Chrome远程调试端口 9222
2025-11-23 11:02:56.193 | SUCCESS | playwright_get_data:connect_to_existing_chrome:57 - 成功连接到Chrome浏览器
2025-11-23 11:02:56.194 | INFO | playwright_get_data:navigate_to_producthunt:111 - 正在访问: https://www.producthunt.com/products/builder-io
2025-11-23 11:02:59.528 | INFO | playwright_get_data:navigate_to_producthunt:116 - 等待页面标题包含'Product Hunt'...
2025-11-23 11:02:59.549 | INFO | playwright_get_data:navigate_to_producthunt:124 - 当前页面标题: Builder.io: The first AI agent for product, design, and code | Product Hunt
2025-11-23 11:02:59.549 | SUCCESS | playwright_get_data:navigate_to_producthunt:128 - 页面标题已包含'Product Hunt',等待时间: 0秒
2025-11-23 11:02:59.549 | SUCCESS | playwright_get_data:navigate_to_producthunt:129 - Product Hunt网站已成功打开
2025-11-23 11:02:59.549 | INFO | playwright_get_data:extract_product_info:291 - 正在提取产品名称...
2025-11-23 11:02:59.550 | INFO | playwright_get_data:record_dom_selection:86 - 记录DOM选取: 产品名称 - 选择器: //h1
2025-11-23 11:02:59.590 | INFO | playwright_get_data:extract_product_info:297 - 产品名称: Builder.io
2025-11-23 11:02:59.590 | INFO | playwright_get_data:extract_product_info:304 - 正在提取产品简介...
2025-11-23 11:02:59.590 | INFO | playwright_get_data:record_dom_selection:86 - 记录DOM选取: 产品简介 - 选择器: //*[@class="relative text-16 font-normal text-gray-700"]//div
2025-11-23 11:02:59.595 | INFO | playwright_get_data:extract_product_info:310 - 产品简介: The first AI agent that unifies product, design, and code. It connects Slack, Jira, Figma, and your repo to turn ideas into production features. Edit visually with real code, sync designs bidirectiona...
2025-11-23 11:02:59.595 | INFO | playwright_get_data:extract_product_info:317 - 正在提取用户数...
2025-11-23 11:02:59.595 | INFO | playwright_get_data:record_dom_selection:86 - 记录DOM选取: 用户数 - 选择器: //*[@class="flex flex-row gap-2"]//div/div[2]/span/p
2025-11-23 11:02:59.600 | INFO | playwright_get_data:extract_product_info:323 - 用户数: 1.9K followers
2025-11-23 11:02:59.600 | INFO | playwright_get_data:extract_product_info:330 - 正在提取制作人发言链接...
2025-11-23 11:02:59.600 | INFO | playwright_get_data:extract_product_info:333 - 等待页面元素加载...
2025-11-23 11:03:19.603 | INFO | playwright_get_data:record_dom_selection:86 - 记录DOM选取: 制作人span标签 - 选择器: //span[contains(@class, "absolute")]
2025-11-23 11:03:19.608 | INFO | playwright_get_data:record_dom_selection:86 - 记录DOM选取: 制作人链接 - 选择器: //span[contains(@class, "absolute")]/parent::a
2025-11-23 11:03:19.616 | INFO | playwright_get_data:extract_product_info:363 - 制作人链接: https://www.producthunt.com/products/builder-io/launches/fusion-1-0
2025-11-23 11:03:19.616 | INFO | playwright_get_data:record_click:75 - 记录点击: - 坐标(制作人链接, 点击制作人链接在当前窗口打开) - 选择器:
2025-11-23 11:03:19.616 | INFO | playwright_get_data:extract_maker_statement_from_current_window:169 - 正在在当前窗口打开制作人链接: https://www.producthunt.com/products/builder-io/launches/fusion-1-0
2025-11-23 11:03:51.755 | ERROR | playwright_get_data:extract_maker_statement_from_current_window:220 - 在当前窗口打开制作人链接失败: Timeout 30000ms exceeded.
=========================== logs ===========================
"load" event fired
============================================================
2025-11-23 11:03:51.758 | INFO | playwright_get_data:extract_product_info:384 - 产品信息已保存到临时文件: temp_product_info.txt
2025-11-23 11:03:52.016 | INFO | playwright_get_data:extract_product_info:389 - 页面截图已保存到: product_screenshot.png
2025-11-23 11:03:52.016 | SUCCESS | __main__:scrape_product_info:214 - 成功提取产品信息: Builder.io
2025-11-23 11:03:52.021 | INFO | playwright_get_data:close:401 - 浏览器连接已关闭
2025-11-23 11:03:52.033 | INFO | playwright_get_data:close:405 - Playwright实例已关闭
2025-11-23 11:03:52.035 | INFO | __main__:save_product_info:179 - 新增产品信息: Builder.io
2025-11-23 11:03:52.038 | SUCCESS | __main__:run_scraping:270 - 成功保存产品信息: Builder.io
2025-11-23 11:03:52.039 | INFO | __main__:run_scraping:254 - 处理URL: https://www.producthunt.com/products/beebot-for-airpods
2025-11-23 11:03:52.039 | INFO | __main__:scrape_product_info:192 - 开始抓取: https://www.producthunt.com/products/beebot-for-airpods
2025-11-23 11:03:52.039 | INFO | playwright_get_data:connect_to_existing_chrome:30 - 正在连接到Chrome远程调试端口 9222
2025-11-23 11:03:52.675 | SUCCESS | playwright_get_data:connect_to_existing_chrome:57 - 成功连接到Chrome浏览器
2025-11-23 11:03:52.675 | INFO | playwright_get_data:navigate_to_producthunt:111 - 正在访问: https://www.producthunt.com/products/beebot-for-airpods
2025-11-23 11:03:55.666 | INFO | playwright_get_data:navigate_to_producthunt:116 - 等待页面标题包含'Product Hunt'...
2025-11-23 11:03:55.680 | INFO | playwright_get_data:navigate_to_producthunt:124 - 当前页面标题: BeeBot for AirPods: Your social audio guide to the city | Product Hunt
2025-11-23 11:03:55.680 | SUCCESS | playwright_get_data:navigate_to_producthunt:128 - 页面标题已包含'Product Hunt',等待时间: 0秒
2025-11-23 11:03:55.680 | SUCCESS | playwright_get_data:navigate_to_producthunt:129 - Product Hunt网站已成功打开
2025-11-23 11:03:55.681 | INFO | playwright_get_data:extract_product_info:291 - 正在提取产品名称...
2025-11-23 11:03:55.681 | INFO | playwright_get_data:record_dom_selection:86 - 记录DOM选取: 产品名称 - 选择器: //h1
2025-11-23 11:03:55.728 | INFO | playwright_get_data:extract_product_info:297 - 产品名称: BeeBot for AirPods
2025-11-23 11:03:55.729 | INFO | playwright_get_data:extract_product_info:304 - 正在提取产品简介...
2025-11-23 11:03:55.729 | INFO | playwright_get_data:record_dom_selection:86 - 记录DOM选取: 产品简介 - 选择器: //*[@class="relative text-16 font-normal text-gray-700"]//div
2025-11-23 11:03:55.741 | INFO | playwright_get_data:extract_product_info:310 - 产品简介: Its like having that friend who knows everything thats happening, except it whispers directly into your ears as you walk around. BeeBot gives you a few short updates a day about people, places, and ...
2025-11-23 11:03:55.741 | INFO | playwright_get_data:extract_product_info:317 - 正在提取用户数...
2025-11-23 11:03:55.742 | INFO | playwright_get_data:record_dom_selection:86 - 记录DOM选取: 用户数 - 选择器: //*[@class="flex flex-row gap-2"]//div/div[2]/span/p
2025-11-23 11:03:55.749 | INFO | playwright_get_data:extract_product_info:323 - 用户数: 242 followers
2025-11-23 11:03:55.749 | INFO | playwright_get_data:extract_product_info:330 - 正在提取制作人发言链接...
2025-11-23 11:03:55.749 | INFO | playwright_get_data:extract_product_info:333 - 等待页面元素加载...
2025-11-23 11:04:15.761 | INFO | playwright_get_data:record_dom_selection:86 - 记录DOM选取: 制作人span标签 - 选择器: //span[contains(@class, "absolute")]
2025-11-23 11:04:15.768 | WARNING | playwright_get_data:extract_product_info:370 - 未找到XPath为//span[contains(@class, "absolute")]的元素
2025-11-23 11:04:15.770 | INFO | playwright_get_data:extract_product_info:384 - 产品信息已保存到临时文件: temp_product_info.txt
2025-11-23 11:04:15.972 | INFO | playwright_get_data:extract_product_info:389 - 页面截图已保存到: product_screenshot.png
2025-11-23 11:04:15.973 | SUCCESS | __main__:scrape_product_info:214 - 成功提取产品信息: BeeBot for AirPods
2025-11-23 11:04:15.979 | INFO | playwright_get_data:close:401 - 浏览器连接已关闭
2025-11-23 11:04:15.988 | INFO | playwright_get_data:close:405 - Playwright实例已关闭
2025-11-23 11:04:15.991 | INFO | __main__:save_product_info:179 - 新增产品信息: BeeBot for AirPods
2025-11-23 11:04:15.994 | SUCCESS | __main__:run_scraping:270 - 成功保存产品信息: BeeBot for AirPods
2025-11-23 11:04:15.994 | INFO | __main__:run_scraping:254 - 处理URL: https://www.producthunt.com/products/iisee-me
2025-11-23 11:04:15.995 | INFO | __main__:scrape_product_info:192 - 开始抓取: https://www.producthunt.com/products/iisee-me
2025-11-23 11:04:15.996 | INFO | playwright_get_data:connect_to_existing_chrome:30 - 正在连接到Chrome远程调试端口 9222
2025-11-23 11:04:16.640 | SUCCESS | playwright_get_data:connect_to_existing_chrome:57 - 成功连接到Chrome浏览器
2025-11-23 11:04:16.641 | INFO | playwright_get_data:navigate_to_producthunt:111 - 正在访问: https://www.producthunt.com/products/iisee-me
2025-11-23 11:04:29.367 | INFO | playwright_get_data:navigate_to_producthunt:116 - 等待页面标题包含'Product Hunt'...
2025-11-23 11:04:29.448 | INFO | playwright_get_data:navigate_to_producthunt:124 - 当前页面标题: iisee.me: Create your own AI generated expression grid | Product Hunt
2025-11-23 11:04:29.448 | SUCCESS | playwright_get_data:navigate_to_producthunt:128 - 页面标题已包含'Product Hunt',等待时间: 0秒
2025-11-23 11:04:29.449 | SUCCESS | playwright_get_data:navigate_to_producthunt:129 - Product Hunt网站已成功打开
2025-11-23 11:04:29.449 | INFO | playwright_get_data:extract_product_info:291 - 正在提取产品名称...
2025-11-23 11:04:29.449 | INFO | playwright_get_data:record_dom_selection:86 - 记录DOM选取: 产品名称 - 选择器: //h1
2025-11-23 11:04:29.521 | INFO | playwright_get_data:extract_product_info:297 - 产品名称: iisee.me
2025-11-23 11:04:29.521 | INFO | playwright_get_data:extract_product_info:304 - 正在提取产品简介...
2025-11-23 11:04:29.522 | INFO | playwright_get_data:record_dom_selection:86 - 记录DOM选取: 产品简介 - 选择器: //*[@class="relative text-16 font-normal text-gray-700"]//div
2025-11-23 11:04:29.528 | INFO | playwright_get_data:extract_product_info:310 - 产品简介: A silly AI experiment that turns your photo into a grid of faces that track your mouse. Built in under 8 hours just for fun....
2025-11-23 11:04:29.528 | INFO | playwright_get_data:extract_product_info:317 - 正在提取用户数...
2025-11-23 11:04:29.528 | INFO | playwright_get_data:record_dom_selection:86 - 记录DOM选取: 用户数 - 选择器: //*[@class="flex flex-row gap-2"]//div/div[2]/span/p
2025-11-23 11:04:29.534 | INFO | playwright_get_data:extract_product_info:323 - 用户数: 172 followers
2025-11-23 11:04:29.535 | INFO | playwright_get_data:extract_product_info:330 - 正在提取制作人发言链接...
2025-11-23 11:04:29.535 | INFO | playwright_get_data:extract_product_info:333 - 等待页面元素加载...
2025-11-23 11:04:49.544 | INFO | playwright_get_data:record_dom_selection:86 - 记录DOM选取: 制作人span标签 - 选择器: //span[contains(@class, "absolute")]
2025-11-23 11:04:49.552 | WARNING | playwright_get_data:extract_product_info:370 - 未找到XPath为//span[contains(@class, "absolute")]的元素
2025-11-23 11:04:49.553 | INFO | playwright_get_data:extract_product_info:384 - 产品信息已保存到临时文件: temp_product_info.txt
2025-11-23 11:04:49.765 | INFO | playwright_get_data:extract_product_info:389 - 页面截图已保存到: product_screenshot.png
2025-11-23 11:04:49.765 | SUCCESS | __main__:scrape_product_info:214 - 成功提取产品信息: iisee.me
2025-11-23 11:04:49.769 | INFO | playwright_get_data:close:401 - 浏览器连接已关闭
2025-11-23 11:04:49.781 | INFO | playwright_get_data:close:405 - Playwright实例已关闭
2025-11-23 11:04:49.783 | INFO | __main__:save_product_info:179 - 新增产品信息: iisee.me
2025-11-23 11:04:49.786 | SUCCESS | __main__:run_scraping:270 - 成功保存产品信息: iisee.me
2025-11-23 11:04:49.786 | INFO | __main__:run_scraping:254 - 处理URL: https://www.producthunt.com/products/melodic-mind-2
2025-11-23 11:04:49.787 | INFO | __main__:scrape_product_info:192 - 开始抓取: https://www.producthunt.com/products/melodic-mind-2
2025-11-23 11:04:49.787 | INFO | playwright_get_data:connect_to_existing_chrome:30 - 正在连接到Chrome远程调试端口 9222
2025-11-23 11:04:50.463 | SUCCESS | playwright_get_data:connect_to_existing_chrome:57 - 成功连接到Chrome浏览器
2025-11-23 11:04:50.463 | INFO | playwright_get_data:navigate_to_producthunt:111 - 正在访问: https://www.producthunt.com/products/melodic-mind-2
2025-11-23 11:04:51.994 | INFO | playwright_get_data:navigate_to_producthunt:116 - 等待页面标题包含'Product Hunt'...
2025-11-23 11:04:52.011 | INFO | playwright_get_data:navigate_to_producthunt:124 - 当前页面标题: Melodic Mind: Create, learn, and grow as a musician | Product Hunt
2025-11-23 11:04:52.011 | SUCCESS | playwright_get_data:navigate_to_producthunt:128 - 页面标题已包含'Product Hunt',等待时间: 0秒
2025-11-23 11:04:52.011 | SUCCESS | playwright_get_data:navigate_to_producthunt:129 - Product Hunt网站已成功打开
2025-11-23 11:04:52.012 | INFO | playwright_get_data:extract_product_info:291 - 正在提取产品名称...
2025-11-23 11:04:52.012 | INFO | playwright_get_data:record_dom_selection:86 - 记录DOM选取: 产品名称 - 选择器: //h1
2025-11-23 11:04:52.039 | INFO | playwright_get_data:extract_product_info:297 - 产品名称: Melodic Mind
2025-11-23 11:04:52.039 | INFO | playwright_get_data:extract_product_info:304 - 正在提取产品简介...
2025-11-23 11:04:52.039 | INFO | playwright_get_data:record_dom_selection:86 - 记录DOM选取: 产品简介 - 选择器: //*[@class="relative text-16 font-normal text-gray-700"]//div
2025-11-23 11:04:52.047 | INFO | playwright_get_data:extract_product_info:310 - 产品简介: Melodic Mind is an all-in-one music superapp built to help you create, learn, and grow as a musician — no matter your level. It has 20+ different apps that solve every need you have and help you on yo...
2025-11-23 11:04:52.048 | INFO | playwright_get_data:extract_product_info:317 - 正在提取用户数...
2025-11-23 11:04:52.048 | INFO | playwright_get_data:record_dom_selection:86 - 记录DOM选取: 用户数 - 选择器: //*[@class="flex flex-row gap-2"]//div/div[2]/span/p
2025-11-23 11:04:52.053 | INFO | playwright_get_data:extract_product_info:323 - 用户数: 159 followers
2025-11-23 11:04:52.053 | INFO | playwright_get_data:extract_product_info:330 - 正在提取制作人发言链接...
2025-11-23 11:04:52.053 | INFO | playwright_get_data:extract_product_info:333 - 等待页面元素加载...
2025-11-23 11:05:12.061 | INFO | playwright_get_data:record_dom_selection:86 - 记录DOM选取: 制作人span标签 - 选择器: //span[contains(@class, "absolute")]
2025-11-23 11:05:12.065 | INFO | playwright_get_data:record_dom_selection:86 - 记录DOM选取: 制作人链接 - 选择器: //span[contains(@class, "absolute")]/parent::a
2025-11-23 11:05:12.074 | INFO | playwright_get_data:extract_product_info:363 - 制作人链接: https://www.producthunt.com/p/melodic-mind-2/q-a-4
2025-11-23 11:05:12.074 | INFO | playwright_get_data:record_click:75 - 记录点击: - 坐标(制作人链接, 点击制作人链接在当前窗口打开) - 选择器:
2025-11-23 11:05:12.075 | INFO | playwright_get_data:extract_maker_statement_from_current_window:169 - 正在在当前窗口打开制作人链接: https://www.producthunt.com/p/melodic-mind-2/q-a-4
2025-11-23 11:05:15.198 | INFO | playwright_get_data:extract_maker_statement_from_current_window:176 - 等待title元素出现并包含产品名称最长等待2分钟...
2025-11-23 11:07:15.214 | ERROR | playwright_get_data:extract_maker_statement_from_current_window:194 - 等待title元素失败: Page.wait_for_selector: Timeout 120000ms exceeded.
Call log:
- waiting for locator("title") to be visible
239 × locator resolved to hidden <title>Q&A : Melodic Mind Discussion Forums | Product Hu…</title>
2025-11-23 11:07:15.214 | INFO | playwright_get_data:extract_maker_statement_from_current_window:197 - 再等待30秒确保页面完全加载...
2025-11-23 11:07:45.227 | INFO | playwright_get_data:extract_maker_statement_from_current_window:201 - 正在提取制作人评论内容...
2025-11-23 11:07:45.231 | WARNING | playwright_get_data:extract_maker_statement_from_current_window:213 - 未找到XPath为//*[@id="comment-4597755"]/div/div[2]/div/div/div的元素
2025-11-23 11:07:45.233 | INFO | playwright_get_data:extract_product_info:384 - 产品信息已保存到临时文件: temp_product_info.txt
2025-11-23 11:07:45.476 | INFO | playwright_get_data:extract_product_info:389 - 页面截图已保存到: product_screenshot.png
2025-11-23 11:07:45.479 | SUCCESS | __main__:scrape_product_info:214 - 成功提取产品信息: Melodic Mind
2025-11-23 11:07:45.483 | INFO | playwright_get_data:close:401 - 浏览器连接已关闭
2025-11-23 11:07:45.495 | INFO | playwright_get_data:close:405 - Playwright实例已关闭
2025-11-23 11:07:45.496 | INFO | __main__:save_product_info:179 - 新增产品信息: Melodic Mind
2025-11-23 11:07:45.499 | SUCCESS | __main__:run_scraping:270 - 成功保存产品信息: Melodic Mind
2025-11-23 11:07:45.499 | INFO | __main__:run_scraping:254 - 处理URL: https://www.producthunt.com/products/agor
2025-11-23 11:07:45.500 | INFO | __main__:scrape_product_info:192 - 开始抓取: https://www.producthunt.com/products/agor
2025-11-23 11:07:45.500 | INFO | playwright_get_data:connect_to_existing_chrome:30 - 正在连接到Chrome远程调试端口 9222
2025-11-23 11:07:46.146 | SUCCESS | playwright_get_data:connect_to_existing_chrome:57 - 成功连接到Chrome浏览器
2025-11-23 11:07:46.146 | INFO | playwright_get_data:navigate_to_producthunt:111 - 正在访问: https://www.producthunt.com/products/agor
2025-11-23 11:07:49.097 | INFO | playwright_get_data:navigate_to_producthunt:116 - 等待页面标题包含'Product Hunt'...
2025-11-23 11:07:49.112 | INFO | playwright_get_data:navigate_to_producthunt:124 - 当前页面标题: agor: Orchestrate multiple AI coding agents with your team | Product Hunt
2025-11-23 11:07:49.112 | SUCCESS | playwright_get_data:navigate_to_producthunt:128 - 页面标题已包含'Product Hunt',等待时间: 0秒
2025-11-23 11:07:49.113 | SUCCESS | playwright_get_data:navigate_to_producthunt:129 - Product Hunt网站已成功打开
2025-11-23 11:07:49.113 | INFO | playwright_get_data:extract_product_info:291 - 正在提取产品名称...
2025-11-23 11:07:49.113 | INFO | playwright_get_data:record_dom_selection:86 - 记录DOM选取: 产品名称 - 选择器: //h1
2025-11-23 11:07:49.185 | INFO | playwright_get_data:extract_product_info:297 - 产品名称: agor
2025-11-23 11:07:49.186 | INFO | playwright_get_data:extract_product_info:304 - 正在提取产品简介...
2025-11-23 11:07:49.186 | INFO | playwright_get_data:record_dom_selection:86 - 记录DOM选取: 产品简介 - 选择器: //*[@class="relative text-16 font-normal text-gray-700"]//div
2025-11-23 11:07:49.191 | INFO | playwright_get_data:extract_product_info:310 - 产品简介: Next-gen agent orchestration for AI coding. Multiplayer workspace for Claude Code, Codex, and Gemini....
2025-11-23 11:07:49.191 | INFO | playwright_get_data:extract_product_info:317 - 正在提取用户数...
2025-11-23 11:07:49.191 | INFO | playwright_get_data:record_dom_selection:86 - 记录DOM选取: 用户数 - 选择器: //*[@class="flex flex-row gap-2"]//div/div[2]/span/p
2025-11-23 11:07:49.199 | INFO | playwright_get_data:extract_product_info:323 - 用户数: 133 followers
2025-11-23 11:07:49.199 | INFO | playwright_get_data:extract_product_info:330 - 正在提取制作人发言链接...
2025-11-23 11:07:49.200 | INFO | playwright_get_data:extract_product_info:333 - 等待页面元素加载...
2025-11-23 11:08:09.216 | INFO | playwright_get_data:record_dom_selection:86 - 记录DOM选取: 制作人span标签 - 选择器: //span[contains(@class, "absolute")]
2025-11-23 11:08:09.223 | WARNING | playwright_get_data:extract_product_info:370 - 未找到XPath为//span[contains(@class, "absolute")]的元素
2025-11-23 11:08:09.226 | INFO | playwright_get_data:extract_product_info:384 - 产品信息已保存到临时文件: temp_product_info.txt
2025-11-23 11:08:09.428 | INFO | playwright_get_data:extract_product_info:389 - 页面截图已保存到: product_screenshot.png
2025-11-23 11:08:09.428 | SUCCESS | __main__:scrape_product_info:214 - 成功提取产品信息: agor
2025-11-23 11:08:09.433 | INFO | playwright_get_data:close:401 - 浏览器连接已关闭
2025-11-23 11:08:09.442 | INFO | playwright_get_data:close:405 - Playwright实例已关闭
2025-11-23 11:08:09.444 | INFO | __main__:save_product_info:179 - 新增产品信息: agor
2025-11-23 11:08:09.447 | SUCCESS | __main__:run_scraping:270 - 成功保存产品信息: agor
2025-11-23 11:08:09.447 | INFO | __main__:run_scraping:254 - 处理URL: https://www.producthunt.com/products/quiteinbox
2025-11-23 11:08:09.448 | INFO | __main__:scrape_product_info:192 - 开始抓取: https://www.producthunt.com/products/quiteinbox
2025-11-23 11:08:09.448 | INFO | playwright_get_data:connect_to_existing_chrome:30 - 正在连接到Chrome远程调试端口 9222
2025-11-23 11:08:10.097 | SUCCESS | playwright_get_data:connect_to_existing_chrome:57 - 成功连接到Chrome浏览器
2025-11-23 11:08:10.097 | INFO | playwright_get_data:navigate_to_producthunt:111 - 正在访问: https://www.producthunt.com/products/quiteinbox
2025-11-23 11:08:11.298 | INFO | playwright_get_data:navigate_to_producthunt:116 - 等待页面标题包含'Product Hunt'...
2025-11-23 11:08:11.306 | INFO | playwright_get_data:navigate_to_producthunt:124 - 当前页面标题: QuiteInbox: Take back control of your inbox | Product Hunt
2025-11-23 11:08:11.307 | SUCCESS | playwright_get_data:navigate_to_producthunt:128 - 页面标题已包含'Product Hunt',等待时间: 0秒
2025-11-23 11:08:11.308 | SUCCESS | playwright_get_data:navigate_to_producthunt:129 - Product Hunt网站已成功打开
2025-11-23 11:08:11.308 | INFO | playwright_get_data:extract_product_info:291 - 正在提取产品名称...
2025-11-23 11:08:11.308 | INFO | playwright_get_data:record_dom_selection:86 - 记录DOM选取: 产品名称 - 选择器: //h1
2025-11-23 11:08:11.337 | INFO | playwright_get_data:extract_product_info:297 - 产品名称: QuiteInbox
2025-11-23 11:08:11.338 | INFO | playwright_get_data:extract_product_info:304 - 正在提取产品简介...
2025-11-23 11:08:11.338 | INFO | playwright_get_data:record_dom_selection:86 - 记录DOM选取: 产品简介 - 选择器: //*[@class="relative text-16 font-normal text-gray-700"]//div
2025-11-23 11:08:11.344 | INFO | playwright_get_data:extract_product_info:310 - 产品简介: Unsubscribe from unwanted emails in seconds. No servers. No tracking. Everything happens locally in your browser. 100% free and open source....
2025-11-23 11:08:11.344 | INFO | playwright_get_data:extract_product_info:317 - 正在提取用户数...
2025-11-23 11:08:11.345 | INFO | playwright_get_data:record_dom_selection:86 - 记录DOM选取: 用户数 - 选择器: //*[@class="flex flex-row gap-2"]//div/div[2]/span/p
2025-11-23 11:08:11.354 | INFO | playwright_get_data:extract_product_info:323 - 用户数: 149 followers
2025-11-23 11:08:11.355 | INFO | playwright_get_data:extract_product_info:330 - 正在提取制作人发言链接...
2025-11-23 11:08:11.355 | INFO | playwright_get_data:extract_product_info:333 - 等待页面元素加载...
2025-11-23 11:08:31.367 | INFO | playwright_get_data:record_dom_selection:86 - 记录DOM选取: 制作人span标签 - 选择器: //span[contains(@class, "absolute")]
2025-11-23 11:08:31.370 | WARNING | playwright_get_data:extract_product_info:370 - 未找到XPath为//span[contains(@class, "absolute")]的元素
2025-11-23 11:08:31.372 | INFO | playwright_get_data:extract_product_info:384 - 产品信息已保存到临时文件: temp_product_info.txt
2025-11-23 11:08:31.590 | INFO | playwright_get_data:extract_product_info:389 - 页面截图已保存到: product_screenshot.png
2025-11-23 11:08:31.590 | SUCCESS | __main__:scrape_product_info:214 - 成功提取产品信息: QuiteInbox
2025-11-23 11:08:31.595 | INFO | playwright_get_data:close:401 - 浏览器连接已关闭
2025-11-23 11:08:31.604 | INFO | playwright_get_data:close:405 - Playwright实例已关闭
2025-11-23 11:08:31.607 | INFO | __main__:save_product_info:179 - 新增产品信息: QuiteInbox
2025-11-23 11:08:31.610 | SUCCESS | __main__:run_scraping:270 - 成功保存产品信息: QuiteInbox
2025-11-23 11:08:31.610 | INFO | __main__:run_scraping:254 - 处理URL: https://www.producthunt.com/products/everywhere
2025-11-23 11:08:31.611 | INFO | __main__:scrape_product_info:192 - 开始抓取: https://www.producthunt.com/products/everywhere
2025-11-23 11:08:31.611 | INFO | playwright_get_data:connect_to_existing_chrome:30 - 正在连接到Chrome远程调试端口 9222
2025-11-23 11:08:32.245 | SUCCESS | playwright_get_data:connect_to_existing_chrome:57 - 成功连接到Chrome浏览器
2025-11-23 11:08:32.246 | INFO | playwright_get_data:navigate_to_producthunt:111 - 正在访问: https://www.producthunt.com/products/everywhere
2025-11-23 11:08:33.776 | INFO | playwright_get_data:navigate_to_producthunt:116 - 等待页面标题包含'Product Hunt'...
2025-11-23 11:08:33.813 | INFO | playwright_get_data:navigate_to_producthunt:124 - 当前页面标题: Everywhere: Every moment, Every place. Your AI: Everywhere | Product Hunt
2025-11-23 11:08:33.813 | SUCCESS | playwright_get_data:navigate_to_producthunt:128 - 页面标题已包含'Product Hunt',等待时间: 0秒
2025-11-23 11:08:33.813 | SUCCESS | playwright_get_data:navigate_to_producthunt:129 - Product Hunt网站已成功打开
2025-11-23 11:08:33.813 | INFO | playwright_get_data:extract_product_info:291 - 正在提取产品名称...
2025-11-23 11:08:33.813 | INFO | playwright_get_data:record_dom_selection:86 - 记录DOM选取: 产品名称 - 选择器: //h1
2025-11-23 11:08:33.897 | INFO | playwright_get_data:extract_product_info:297 - 产品名称: Everywhere
2025-11-23 11:08:33.897 | INFO | playwright_get_data:extract_product_info:304 - 正在提取产品简介...
2025-11-23 11:08:33.897 | INFO | playwright_get_data:record_dom_selection:86 - 记录DOM选取: 产品简介 - 选择器: //*[@class="relative text-16 font-normal text-gray-700"]//div
2025-11-23 11:08:33.904 | INFO | playwright_get_data:extract_product_info:310 - 产品简介: Everywhere is dedicated to liberating AI from browser tabs and standalone apps, making it a ubiquitous, native capability of your operating system. We believe true productivity gains stem from the sea...
2025-11-23 11:08:33.904 | INFO | playwright_get_data:extract_product_info:317 - 正在提取用户数...
2025-11-23 11:08:33.904 | INFO | playwright_get_data:record_dom_selection:86 - 记录DOM选取: 用户数 - 选择器: //*[@class="flex flex-row gap-2"]//div/div[2]/span/p
2025-11-23 11:08:33.911 | INFO | playwright_get_data:extract_product_info:323 - 用户数: 204 followers
2025-11-23 11:08:33.912 | INFO | playwright_get_data:extract_product_info:330 - 正在提取制作人发言链接...
2025-11-23 11:08:33.912 | INFO | playwright_get_data:extract_product_info:333 - 等待页面元素加载...
2025-11-23 11:08:53.915 | INFO | playwright_get_data:record_dom_selection:86 - 记录DOM选取: 制作人span标签 - 选择器: //span[contains(@class, "absolute")]
2025-11-23 11:08:53.920 | WARNING | playwright_get_data:extract_product_info:370 - 未找到XPath为//span[contains(@class, "absolute")]的元素
2025-11-23 11:08:53.921 | INFO | playwright_get_data:extract_product_info:384 - 产品信息已保存到临时文件: temp_product_info.txt
2025-11-23 11:08:54.140 | INFO | playwright_get_data:extract_product_info:389 - 页面截图已保存到: product_screenshot.png
2025-11-23 11:08:54.140 | SUCCESS | __main__:scrape_product_info:214 - 成功提取产品信息: Everywhere
2025-11-23 11:08:54.145 | INFO | playwright_get_data:close:401 - 浏览器连接已关闭
2025-11-23 11:08:54.155 | INFO | playwright_get_data:close:405 - Playwright实例已关闭
2025-11-23 11:08:54.158 | INFO | __main__:save_product_info:179 - 新增产品信息: Everywhere
2025-11-23 11:08:54.162 | SUCCESS | __main__:run_scraping:270 - 成功保存产品信息: Everywhere
2025-11-23 11:08:54.163 | INFO | __main__:show_scraping_results:303 - === 抓取结果统计 ===
2025-11-23 11:08:54.163 | INFO | __main__:show_scraping_results:304 - 成功抓取: 9 个产品
2025-11-23 11:08:54.164 | INFO | __main__:show_scraping_results:305 - 跳过重复: 1 个链接
2025-11-23 11:08:54.164 | INFO | __main__:show_scraping_results:306 - 抓取失败: 0 个链接
2025-11-23 11:08:54.164 | INFO | __main__:show_scraping_results:307 - 数据库中的产品总数: 10
2025-11-23 11:08:54.164 | INFO | __main__:show_scraping_results:310 - 最新抓取的产品:
2025-11-23 11:08:54.164 | INFO | __main__:show_scraping_results:312 - - Everywhere: https://www.producthunt.com/products/everywhere
2025-11-23 11:08:54.164 | INFO | __main__:show_scraping_results:312 - - QuiteInbox: https://www.producthunt.com/products/quiteinbox
2025-11-23 11:08:54.164 | INFO | __main__:show_scraping_results:312 - - agor: https://www.producthunt.com/products/agor
2025-11-23 11:08:54.164 | INFO | __main__:show_scraping_results:312 - - Melodic Mind: https://www.producthunt.com/products/melodic-mind-2
2025-11-23 11:08:54.165 | INFO | __main__:show_scraping_results:312 - - iisee.me: https://www.producthunt.com/products/iisee-me
2025-11-23 11:08:54.165 | INFO | __main__:show_scraping_results:312 - - BeeBot for AirPods: https://www.producthunt.com/products/beebot-for-airpods
2025-11-23 11:08:54.165 | INFO | __main__:show_scraping_results:312 - - Builder.io: https://www.producthunt.com/products/builder-io
2025-11-23 11:08:54.165 | INFO | __main__:show_scraping_results:312 - - American Ratings Lead Magnet Portal: https://www.producthunt.com/products/american-ratings-lead-magnet-portal
2025-11-23 11:08:54.165 | INFO | __main__:show_scraping_results:312 - - Pixley AI: https://www.producthunt.com/products/pixley-ai
2025-11-23 11:08:54.165 | INFO | __main__:show_scraping_results:312 - - Burner: https://www.producthunt.com/products/burner-2
2025-11-23 11:08:54.165 | SUCCESS | __main__:run_scraping:284 - === ProductHunt数据抓取完成 ===

View File

@@ -1,11 +0,0 @@
=== Product Hunt 产品信息 ===
产品名称: Everywhere
产品简介: Everywhere is dedicated to liberating AI from browser tabs and standalone apps, making it a ubiquitous, native capability of your operating system. We believe true productivity gains stem from the seamless integration of AI with your current tasks. Unlike conventional tools like ChatGPT, Everywhere perceives and understands any content on your screen in real-time. No need to screenshot, copy, or switch apps—simply use a hotkey to get the help you need, right where you are.
制作人发言: 未获取
用户数: 204 followers
提取时间: 2025-11-23 11:08:53