参考文献精准+模糊双重搜索

import html
import os
import re
import sys
import urllib.parse

# Base URL of the Google Scholar search; the query string is appended to it.
SCHOLAR_URL = "https://scholar.google.com/scholar?q="

# Maximum number of characters of the raw citation shown as the entry heading.
PREVIEW_LEN = 80

# Encodings tried, in order, when reading the input file. 'utf-8-sig' decodes
# plain UTF-8 as well and additionally drops a leading BOM, so the first
# citation does not start with an invisible U+FEFF character.
INPUT_ENCODINGS = ('utf-8-sig', 'gb18030')

# Citation type marker such as [J], [M], [C] and everything that follows it.
_TYPE_TAG_RE = re.compile(r'\[[a-zA-Z]+\][.]?.*$')
# Volume/page information such as ", 44: 173-245." and everything after it.
_PAGES_RE = re.compile(r'[,.]?\s*\d+[:]\s*\d+[-]\d+.*$')
# A 19xx/20xx year introduced by ',', '.' or '(' e.g. "(2018)", ", 2011.".
_YEAR_RE = re.compile(r'([,.(]\s*(?:19|20)\d{2}[).]?)\s*')
# Author-list terminators: "et al" and its Chinese counterpart.
_ET_AL_RE = re.compile(r'et al[.,]*\s*')
_DENG_RE = re.compile(r'等[.,]*\s*')

HTML_HEAD = """<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8">
<title>参考文献精准+模糊双重搜索</title>
</head>
<body>
<h2>📚 参考文献修正与搜索工具</h2>
<div class="tips">
<b>使用策略：</b><br>
1. 优先点击 [提取版搜索]，这是脚本尝试去掉作者和年份后的结果。<br>
2. 如果结果不对（比如没搜到），点击 [原文搜索]，直接让谷歌处理原始文本。<br>
3. 在谷歌学术页面点击引用按钮 (引号图标) -&gt; 复制 GB/T 7714。
</div>
"""

HTML_TAIL = """
</body>
</html>
"""


def clean_text_smart(text):
    """Heuristically reduce a citation string to its title.

    The cleaning is deliberately conservative so that it rarely deletes too
    much:

    1. Drop a citation type marker (``[J]``, ``[M]``, ...) and everything
       after it.
    2. Drop trailing volume/page information such as ``, 44: 173-245.``.
    3. If a 19xx/20xx year is found within the first 60% of the text and
       more than 5 characters follow it, keep only what follows the year.
    4. If the text contains ``et al`` (or, failing that, ``等``), keep only
       what follows the last occurrence, provided it is longer than 5
       characters.
    5. Strip surrounding spaces and ``.,;``.

    Args:
        text: One raw citation line.

    Returns:
        The extracted title, or the unmodified input when the cleaned result
        is shorter than 3 characters.
    """
    original = text
    text = text.strip()

    text = _TYPE_TAG_RE.sub('', text)
    text = _PAGES_RE.sub('', text)

    # A year in the front part of the citation most likely separates the
    # author list from the title.
    year_match = _YEAR_RE.search(text)
    if year_match and year_match.start() < len(text) * 0.6:
        potential_title = text[year_match.end():].strip()
        # Only accept the remainder when it is more than leftover punctuation.
        if len(potential_title) > 5:
            text = potential_title

    # "et al" takes precedence; the Chinese marker is only tried without it.
    if 'et al' in text:
        splitter = _ET_AL_RE
    elif '等' in text:
        splitter = _DENG_RE
    else:
        splitter = None
    if splitter is not None:
        tail = splitter.split(text)[-1]
        if len(tail) > 5:
            text = tail

    # Anything still looking like a bare author list is left as is; the
    # search engine gets to guess.
    text = text.strip(' .,;')

    # Too little survived the cleaning: fall back to the raw input.
    if len(text) < 3:
        return original
    return text


def read_lines(path):
    """Read all lines of *path*, trying each encoding in INPUT_ENCODINGS.

    Raises:
        OSError: The file cannot be opened (missing, no permission, ...).
        UnicodeDecodeError: The file decodes with none of the encodings.
    """
    last_error = None
    for encoding in INPUT_ENCODINGS:
        try:
            with open(path, 'r', encoding=encoding) as f:
                return f.readlines()
        except UnicodeDecodeError as exc:
            last_error = exc
    raise last_error


def render_entry(index, line):
    """Return the HTML fragment for one citation.

    The fragment shows the entry number, a preview of the raw citation and
    two Google Scholar links: one searching the extracted title and one
    searching the raw text.
    """
    clean_title = clean_text_smart(line)
    url_clean = SCHOLAR_URL + urllib.parse.quote(clean_title)
    url_raw = SCHOLAR_URL + urllib.parse.quote(line)

    # Only mark the preview as truncated when something was actually cut.
    preview = line[:PREVIEW_LEN]
    if len(line) > PREVIEW_LEN:
        preview += "..."

    # The citation is arbitrary user text: escape it before embedding it in
    # markup so that '<', '&' or quotes cannot break the page.
    return f"""
<div class="item">
<div class="ref">#{index} {html.escape(preview)}</div>
<a href="{html.escape(url_clean, quote=True)}" target="_blank" rel="noopener">🔍 提取版搜索</a>
<a href="{html.escape(url_raw, quote=True)}" target="_blank" rel="noopener">📄 原文搜索</a>
</div>
"""


def build_html(lines):
    """Build the complete result page from raw citation lines.

    Blank lines are skipped; the remaining lines are numbered from 1.

    Returns:
        A ``(page, count)`` tuple: the HTML document and the number of
        citations it contains.
    """
    citations = [stripped for stripped in (raw.strip() for raw in lines) if stripped]
    parts = [HTML_HEAD]
    parts.extend(
        render_entry(index, citation)
        for index, citation in enumerate(citations, start=1)
    )
    parts.append(HTML_TAIL)
    return "".join(parts), len(citations)


def main():
    """Read input.txt next to this script and write citations_search.html."""
    current_dir = os.path.dirname(os.path.abspath(__file__))
    input_path = os.path.join(current_dir, 'input.txt')
    output_html_path = os.path.join(current_dir, 'citations_search.html')

    print(f"📂 读取文件: {input_path}")

    try:
        lines = read_lines(input_path)
    except UnicodeDecodeError:
        print("❌ 无法读取文件，请检查编码！")
        sys.exit(1)
    except OSError as exc:
        # A missing or unreadable file is not an encoding problem; report it
        # as what it is.
        print(f"❌ 无法打开文件: {exc}")
        sys.exit(1)

    html_content, count = build_html(lines)

    with open(output_html_path, 'w', encoding='utf-8') as f:
        f.write(html_content)

    print(f"\n✅ 处理完成！共 {count} 条文献。")
    print(f"👉 请双击打开文件: {output_html_path}")


if __name__ == "__main__":
    main()