A Small Tool for Converting Paper Citations

This started while I was writing a paper recently: for convenience I cited references straight from arXiv's citation export, but that is not ideal, since papers that have already been published elsewhere still list arXiv as the publisher. So I decided to write a tool to convert these entries.
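For example, an entry exported from arXiv typically looks like the first block below, while the published version should look something like the second (both entries are made up purely to illustrate the difference):

  @misc{doe2023example,
    title         = {An Example Paper Title},
    author        = {Doe, Jane},
    year          = {2023},
    eprint        = {2301.00000},
    archiveprefix = {arXiv},
    publisher     = {arXiv}
  }

  @inproceedings{doe2023example,
    title     = {An Example Paper Title},
    author    = {Doe, Jane},
    booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)},
    year      = {2023}
  }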

I first looked around for existing tools. The one that shows up most often is yuchenlin/rebiber: A simple tool to update bib entries with their official information (e.g., DBLP or the ACL anthology). (github.com), and there is also an online demo at Rebiber - a Hugging Face Space by yuchenlin.

Rebiber mainly relies on metadata from NLP venues to do the conversion, so some entries still fail.

For example, one CVPR 2023 paper failed to convert for me and still showed the arXiv link afterwards.

So I came up with another approach (I had previously seen someone else use a similar method, but I can no longer find the link): use the citations from Google Scholar. The logic is simple: go through each entry in the bib file, pick out the arXiv ones, search for each title on Google Scholar, and replace the entry with the result. Of course, some excellent papers only ever exist as arXiv versions; this is especially common in deep learning, where some well-known researchers publish that way.

This approach has a catch: after a few requests Google throws up a captcha. Because plain requests cannot run JavaScript, the page simply reports that access is denied and verification is required, so I switched to Selenium and pass the check through a WebDriver-controlled browser.

The full code is below.

from pathlib import Path
import time
from urllib.parse import quote_plus

import bibtexparser as bp
import requests
from lxml import etree
from requests.compat import urljoin
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.edge.options import Options

scholar_site = "https://scholar.google.com/scholar"
bib_url = "https://scholar.google.cz/scholar"


class Parser:
    def __init__(self, parse_type='b'):
        # 'b' = browser mode: drive a real browser so the captcha can be solved by hand.
        if parse_type == 'b':
            edge_ops = Options()
            # edge_ops.add_argument('--headless')
            self.driver = webdriver.Edge(options=edge_ops)
        self.flag = True  # True until the first captcha has been passed

    def search_thesis(self, title: str):
        # Plain-requests version; works until Google Scholar starts demanding a captcha.
        response = requests.get(scholar_site, params={'q': title, 'hl': 'zh-CN'})
        root = etree.HTML(response.text)
        nodes = root.xpath('//div[@id="gs_res_ccl_mid"]')
        assert len(nodes) == 1, "No thesis found."
        node = nodes[0]
        first_thesis = node.xpath('.//div[contains(@class, "gs_r") and contains(@class, "gs_or") '
                                  'and contains(@class, "gs_scl") and @data-cid]')
        if len(first_thesis) != 1:
            raise Exception("No thesis found.")
        data_cid = first_thesis[0].get("data-cid")
        res = requests.get(bib_url,
                           params={'q': f"info:{data_cid}:scholar.google.com/", 'output': 'cite',
                                   'hl': 'zh-CN', 'scirp': '0'})
        time.sleep(2)  # slow down to reduce the chance of being rate-limited
        return self.get_bib_from_scholar(res.text)

    def get_bib_from_scholar(self, res: str):
        # Pull the BibTeX download link out of the citation popup and fetch it.
        root = etree.HTML(res)
        bibtex_url = root.xpath('//div[@id="gs_citi"]')[0].xpath('.//a')[0].get('href')
        bib_res = requests.get(bibtex_url)
        time.sleep(2)
        return bib_res.text

    def search_thesis_from_browser(self, title: str):
        url_query = urljoin(scholar_site, f"?q={quote_plus(title)}&hl=zh-CN")
        self.driver.get(url_query)
        print("Searching thesis information.")
        if self.flag:
            # Google usually shows a captcha on the first automated visit;
            # wait here until the user has solved it in the browser window.
            try:
                captcha = self.driver.find_element(By.XPATH, '//*[@id="gs_captcha_c"]')
                print("Captcha detected. Please solve the captcha.")
                self.driver.implicitly_wait(100)
                while captcha.is_displayed():
                    time.sleep(2)
                self.flag = False
            except Exception:
                # No captcha element on the page, nothing to wait for.
                self.flag = False
        print("Retrieving thesis information.")
        time.sleep(3)
        # Click the "Cite" link of the first search result.
        ref = self.driver.find_element(
            By.XPATH, '//div[@id="gs_res_ccl_mid"]/*[1]//div[@class="gs_fl gs_flb"]/a[2]')
        ref.click()
        self.driver.implicitly_wait(10)
        # Open the BibTeX page and read the entry from its <pre> element.
        bib_link = self.driver.find_element(By.XPATH, '//*[@id="gs_citi"]/a[1]')
        time.sleep(2)
        bib_link.click()
        self.driver.implicitly_wait(10)
        time.sleep(2)
        bib_text = self.driver.find_element(By.TAG_NAME, 'pre').text
        return bib_text


def parse_bib(file_path: str, output_file_path: str = None):
    if output_file_path is None:
        output_file_path = Path(file_path).stem + "_parsed.bib"
    # Truncate the output file; entries are appended to it one by one below.
    with open(output_file_path, "w", encoding="utf-8") as f:
        f.write("")
    bib_content = bp.parse_file(file_path)
    if len(bib_content.failed_blocks) > 0:
        raise Exception(
            '\033[92m' + "Some blocks failed to parse. Check the entries of `library.failed_blocks`." + '\033[0m')
    else:
        print("All blocks parsed successfully")
    parser = Parser()
    for idx in range(len(bib_content.entries)):
        entry = bib_content.entries[idx]
        try:
            publisher = entry['publisher'].replace("{", "").replace("}", "")
        except Exception:
            publisher = ""
        try:
            archiveprefix = entry['archiveprefix'].replace("{", "").replace("}", "")
        except Exception:
            archiveprefix = ""
        # Compare case-insensitively: the field value is usually written "arXiv".
        if publisher.lower() == "arxiv" or archiveprefix.lower() == "arxiv":
            # An arXiv entry: look it up on Google Scholar and replace it.
            title = entry['title'].replace("{{", "").replace("}}", "")
            key = entry.key
            try:
                lib = bp.parse_string(parser.search_thesis_from_browser(title=title))
                lib.entries[0].key = key  # keep the original citation key
            except Exception as e:
                # Fall back to the original entry so the output stays complete.
                print(f"Error: {e}")
                lib = bp.Library()
                lib.add(entry)
            new_bibtex_str = bp.write_string(lib)
            with open(output_file_path, "a", encoding="utf-8") as f:
                f.write(new_bibtex_str)
        else:
            # Not an arXiv entry: copy it to the output unchanged.
            lib = bp.Library()
            lib.add(entry)
            lib_str = bp.write_string(lib)
            with open(output_file_path, "a", encoding="utf-8") as f:
                f.write(lib_str)
    assert len(bib_content.entries) == len(bp.parse_file(output_file_path).entries), "Some entries are missing."
    print(f"File saved to {output_file_path}")


if __name__ == '__main__':
    parse_bib('references.bib')

The bib file itself is parsed with the bibtexparser library.

I use the Edge browser here, but Chrome works just as well. Also, if you are in mainland China, you will need a global proxy to reach Google Scholar.
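If you would rather use Chrome, a minimal sketch of the only change needed (assuming a matching chromedriver is installed and on your PATH):

  from selenium import webdriver
  from selenium.webdriver.chrome.options import Options

  chrome_ops = Options()
  # chrome_ops.add_argument('--headless')  # headless sessions tend to trigger more captchas
  driver = webdriver.Chrome(options=chrome_ops)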

It is a good idea to double-check the output after processing.
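For example, a quick sanity check (a sketch; references.bib and references_parsed.bib are the input and output file names used above) is to confirm that both files contain the same citation keys:

  import bibtexparser as bp

  old_keys = {e.key for e in bp.parse_file('references.bib').entries}
  new_keys = {e.key for e in bp.parse_file('references_parsed.bib').entries}
  assert old_keys == new_keys, f"Keys that changed or went missing: {old_keys ^ new_keys}"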

References

  1. Full API — BibtexParser latest documentation