import html
import os
import re
import urllib.parse

def clean_text_smart(text):
    """Extract a likely title from one raw reference line, conservatively.

    The cleaning is heuristic and deliberately restrained: each step only
    replaces the working text when what remains is long enough to plausibly
    be a title, so a failed guess leaves more text rather than less.

    Steps, in order:
      1. Cut the document-type marker (``[J]``, ``[M]``, ``[EB/OL]`` ...) and
         everything that follows it.
      2. Cut a trailing ``volume: first-last`` page range and what follows.
      3. Drop an author/year prefix, using a 19xx/20xx year found in the
         first 60% of the text, then an "et al" / "等" author-list marker.
      4. Trim surrounding spaces and ``. , ;``.

    Args:
        text: One reference entry as a string.

    Returns:
        The extracted title candidate, or the original ``text`` unchanged
        (not even stripped) when fewer than 3 characters survive cleaning.
    """
    original = text
    text = text.strip()

    # 1. Remove the document-type marker and everything after it. The optional
    #    "/XX" part covers carrier-qualified markers such as [J/OL] or [EB/OL].
    text = re.sub(r'\[[a-zA-Z]+(?:/[a-zA-Z]+)?\][.]?.*$', '', text)

    # 2. Remove trailing volume/page information (conservative),
    #    e.g. ", 44: 173-245." and anything after it.
    text = re.sub(r'[,.]?\s*\d+[:]\s*\d+[-]\d+.*$', '', text)

    # 3A. Strong split on a year (19xx or 20xx) such as "(2018).", ", 2011."
    #     or ". 2017.". A year in the first 60% of the text is most likely
    #     followed by the title.
    year_match = re.search(r'([,.(]\s*(?:19|20)\d{2}[).]?)\s*', text)
    if year_match and year_match.start() < len(text) * 0.6:
        potential_title = text[year_match.end():].strip()
        # Only accept the remainder if it is more than leftover punctuation.
        if len(potential_title) > 5:
            text = potential_title

    # 3B. Split on an author-list marker and keep what follows the last one.
    #     "et al" must be a standalone phrase: a plain substring test also
    #     hits words like "Budget allocation" and would truncate the title.
    et_al_pattern = r'\bet al\b[.,]*\s*'
    # "等" must be followed by punctuation or whitespace; a bare "等" is part
    # of many ordinary words (e.g. "高等教育") that appear in titles.
    deng_pattern = r'等(?:\s*[.,，．。]+\s*|\s+)'
    if re.search(et_al_pattern, text):
        parts = re.split(et_al_pattern, text)
        if len(parts[-1]) > 5:
            text = parts[-1]
    elif re.search(deng_pattern, text):
        parts = re.split(deng_pattern, text)
        if len(parts[-1]) > 5:
            text = parts[-1]

    # 3C. Author-only leftovers are intentionally not handled here; the text
    #     is kept as-is so the search engine can make its own guess.

    # 4. Trim leading/trailing punctuation.
    text = text.strip(' .,;')

    # If cleaning left nothing useful, fall back to the untouched input.
    if len(text) < 3:
        return original

    return text

# --- 主程序 ---

# Script body: read reference lines from input.txt next to this script and
# write an HTML page with two Google Scholar search links per reference
# (one using the cleaned title, one using the raw line).
current_dir = os.path.dirname(os.path.abspath(__file__))
input_path = os.path.join(current_dir, 'input.txt')
output_html_path = os.path.join(current_dir, 'citations_search.html')

print(f"📂 读取文件: {input_path}")

# Try UTF-8 first ("utf-8-sig" also drops a leading BOM so it cannot leak
# into the first reference), then fall back to GB18030. Only a decoding
# failure moves on to the next encoding; a missing or unreadable file is
# reported as such instead of being blamed on the encoding.
lines = None
for encoding in ('utf-8-sig', 'gb18030'):
    try:
        with open(input_path, 'r', encoding=encoding) as f:
            lines = f.readlines()
        break
    except UnicodeDecodeError:
        continue
    except OSError as err:
        print(f"❌ 无法读取文件: {err}")
        raise SystemExit(1)

if lines is None:
    print("❌ 无法读取文件，请检查编码！")
    raise SystemExit(1)

# HTML header
html_content = """
<!DOCTYPE html>
<html>
<head>
    <meta charset="utf-8">
    <title>参考文献精准+模糊双重搜索</title>
    <style>
        body { font-family: 'Segoe UI', Arial, sans-serif; padding: 20px; background: #f5f5f5; }
        .container { max-width: 900px; margin: 0 auto; background: white; padding: 30px; border-radius: 8px; box-shadow: 0 2px 10px rgba(0,0,0,0.1); }
        h2 { border-bottom: 2px solid #4285f4; padding-bottom: 10px; color: #333; }
        .tip { background: #e8f0fe; color: #1a73e8; padding: 15px; border-radius: 5px; margin-bottom: 20px; }
        .row { border-bottom: 1px solid #eee; padding: 15px 0; display: flex; align-items: center; justify-content: space-between; }
        .row:hover { background: #fafafa; }
        .text-area { flex: 1; padding-right: 20px; font-size: 14px; color: #555; word-break: break-all; }
        .btn-group { min-width: 200px; display: flex; gap: 10px; }
        .btn { text-decoration: none; padding: 8px 15px; border-radius: 4px; font-size: 13px; font-weight: bold; transition: all 0.2s; text-align: center; }
        .btn-primary { background: #4285f4; color: white; border: 1px solid #4285f4; }
        .btn-primary:hover { background: #3367d6; }
        .btn-secondary { background: white; color: #5f6368; border: 1px solid #dadce0; }
        .btn-secondary:hover { background: #f1f3f4; color: #202124; }
        .tag { font-size: 12px; background: #eee; padding: 2px 6px; border-radius: 4px; color: #666; margin-right: 5px; }
    </style>
</head>
<body>
    <div class="container">
        <h2>📚 参考文献修正与搜索工具</h2>
        <div class="tip">
            <strong>使用策略：</strong><br>
            1. 优先点击 <strong>[提取版搜索]</strong>，这是脚本尝试去掉作者和年份后的结果。<br>
            2. 如果结果不对（比如没搜到），点击 <strong>[原文搜索]</strong>，直接让谷歌处理原始文本。<br>
            3. 在谷歌学术页面点击引用按钮 (引号图标) -> 复制 <strong>GB/T 7714</strong>。
        </div>
"""

# Maximum number of characters of the raw reference shown in each row.
PREVIEW_LEN = 80

# Blank lines are skipped and do not consume a reference number.
references = [stripped for stripped in (raw.strip() for raw in lines) if stripped]
count = len(references)

rows = []
for index, line in enumerate(references, start=1):
    # Title candidate produced by the heuristic cleaner.
    clean_title = clean_text_smart(line)

    # Build both search links; quote() percent-encodes everything that could
    # terminate the href attribute, so the URLs are safe to embed as-is.
    url_clean = f"https://scholar.google.com/scholar?q={urllib.parse.quote(clean_title)}"
    url_raw = f"https://scholar.google.com/scholar?q={urllib.parse.quote(line)}"

    # Show an ellipsis only when the preview was actually truncated, and
    # escape the text so "<", ">" or "&" in a reference cannot break the page.
    preview = line[:PREVIEW_LEN] + ('...' if len(line) > PREVIEW_LEN else '')

    rows.append(f"""
        <div class="row">
            <div class="text-area">
                <span class="tag">#{index}</span> {html.escape(preview)}
            </div>
            <div class="btn-group">
                <a href="{url_clean}" target="_blank" class="btn btn-primary">🔍 提取版搜索</a>
                <a href="{url_raw}" target="_blank" class="btn btn-secondary">📄 原文搜索</a>
            </div>
        </div>
    """)

# Join once instead of growing the string inside the loop.
html_content += ''.join(rows)

html_content += """
    </div>
</body>
</html>
"""

with open(output_html_path, 'w', encoding='utf-8') as f:
    f.write(html_content)

print(f"\n✅ 处理完成！共 {count} 条文献。")
print(f"👉 请双击打开文件: {output_html_path}")