From d79051cb241a130829910a60ee1a4db00a5c0699 Mon Sep 17 00:00:00 2001
From: xiaji <rembme@163.com>
Date: Sun, 9 Nov 2025 20:30:41 +0800
Subject: [PATCH] =?UTF-8?q?=E4=BF=AE=E6=94=B9=E4=BA=86=E5=85=A5?=
 =?UTF-8?q?=E5=BA=93=E7=9A=84=E9=80=BB=E8=BE=91=EF=BC=8C=E5=A6=82=E6=9E=9C?=
 =?UTF-8?q?3=E5=A4=A9=E5=86=85=E5=B7=B2=E5=AD=98=E5=9C=A8=E7=9B=B8?=
 =?UTF-8?q?=E5=90=8C=E6=A0=87=E9=A2=98=E7=9A=84=E6=96=87=E7=AB=A0=EF=BC=8C?=
 =?UTF-8?q?=E5=88=99=E8=B7=B3=E8=BF=87=E5=85=A5=E5=BA=93=E3=80=82?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 tophub_add_data_to_db.py | 39 ++++++++++++++++++++++++++++-----------
 1 file changed, 28 insertions(+), 11 deletions(-)

diff --git a/tophub_add_data_to_db.py b/tophub_add_data_to_db.py
index f0f6b99..d0f5233 100644
--- a/tophub_add_data_to_db.py
+++ b/tophub_add_data_to_db.py
@@ -84,19 +84,36 @@ def parse_file_content(file_path):
         return []
 
 def check_duplicate(title, date_str):
-    """检查标题+日期是否已存在"""
+    """检查标题在最近三天（前天、昨天和今天）是否已存在"""
+    from datetime import datetime, timedelta
+    
     conn = sqlite3.connect('tophub_data.db')
     cursor = conn.cursor()
     
-    cursor.execute('''
-        SELECT COUNT(*) FROM articles 
-        WHERE title = ? AND source_date = ?
-    ''', (title, date_str))
-    
-    count = cursor.fetchone()[0]
-    conn.close()
-    
-    return count > 0
+    try:
+        # 将输入日期字符串转换为datetime对象
+        current_date = datetime.strptime(date_str, '%Y-%m-%d')
+        
+        # 计算前天、昨天和今天的日期
+        yesterday = current_date - timedelta(days=1)
+        day_before_yesterday = current_date - timedelta(days=2)
+        
+        # 检查这三天内是否有相同标题的文章
+        cursor.execute('''
+            SELECT COUNT(*) FROM articles 
+            WHERE title = ? AND source_date IN (?, ?, ?)
+        ''', (title, 
+              day_before_yesterday.strftime('%Y-%m-%d'),
+              yesterday.strftime('%Y-%m-%d'),
+              date_str))
+        
+        count = cursor.fetchone()[0]
+        logger.info(f"检查标题 '{title}' 在最近三天的重复情况: 找到 {count} 条相同记录")
+        
+        return count > 0
+        
+    finally:
+        conn.close()
 
 def classify_title(title):
     """调用API对标题进行分类"""
@@ -193,7 +210,7 @@ def process_temp_files():
             
             # 检查重复
             if check_duplicate(article['title'], source_date):
-                logger.info(f"跳过重复文章: {article['title']}")
+                logger.info(f"跳过重复文章(最近三天已存在): {article['title']}")
                 continue
             
             # 分类标题