Skip to content

tools.spider.process_spider_bib

Classes

ProcessSpiderBib

ProcessSpiderBib(path_abbr, abbr_standard)

Check and format the `.bib` files stored in a spider abbreviation folder.

Parameters:

Name Type Description Default
path_abbr str

The path of the abbreviation folder.

required
abbr_standard str

The standard abbreviation.

required

Attributes:

Name Type Description
path_abbr

The path of the abbreviation folder.

abbr_standard

The standard abbreviation.

Source code in pybibtexer/tools/spider/process_spider_bib.py
def __init__(self, path_abbr: str, abbr_standard: str) -> None:
    """Store the folder and abbreviation, then build the shared bib parser.

    Args:
        path_abbr: The path of the abbreviation folder. A leading ``~`` and
            environment variables are expanded before it is stored.
        abbr_standard: The standard abbreviation.
    """
    expanded_home = os.path.expanduser(path_abbr)
    self.path_abbr = os.path.expandvars(expanded_home)
    self.abbr_standard = abbr_standard

    # Patterns and replacements have the same length; presumably they are
    # applied pairwise by the parser (confirm against PythonRunBib).
    patterns_to_replace = [
        r"(<[a-zA-Z\-]+\s*/*\s*>)",  # opening or self-closing tag without attributes
        r"(</[a-zA-Z\-]+>)",  # closing tag
        r'(<[a-zA-Z\-]+ [^\s]+="[^>]+?"\s*/*\s*>)',  # tag carrying a quoted attribute
        r"([ ]+)",  # run of spaces
        r";[; ]*;",  # repeated semicolons
        r",[, ]*,",  # repeated commas
    ]
    replacements = ["", "", "", " ", ";", ","]

    options = {
        "is_standardize_bib": True,  # default is True
        "substitute_old_list": patterns_to_replace,
        "substitute_new_list": replacements,
        "choose_abbr_zotero_save": "save",  # default is "save"
        "delete_field_list_for_save": [],  # default is []
        "is_sort_entry_fields": True,  # default is False
        "is_sort_blocks": True,  # default is False
        "sort_entries_by_field_keys_reverse": False,  # default is True
        "empty_entry_cite_keys": True,
    }
    self._options = options

    # One parser instance is shared by the check and format methods.
    self._python_bib = PythonRunBib(self._options)

Functions

check_spider_bib
check_spider_bib(delete_duplicate_in_bibs=False)

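Check each `.bib` file in the abbreviation folder for duplicated entries (matched by DOI, falling back to URL) and, when `delete_duplicate_in_bibs` is true, rewrite the file without the duplicates.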

Source code in pybibtexer/tools/spider/process_spider_bib.py
def check_spider_bib(self, delete_duplicate_in_bibs: bool = False) -> None:
    """Report, and optionally remove, duplicated entries in every bib file.

    Each ``.bib`` file found under ``self.path_abbr`` is matched by base name
    (the part of the file name before the first dot) with a ``.csv`` file
    from the same folder. Bib files without a matching csv are reported and
    skipped. Entries are identified by their ``doi`` field, falling back to
    ``url``; entries sharing an identifier are reported as duplicates.

    Entries that have neither a ``doi`` nor a ``url`` cannot be compared, so
    they are never treated as duplicates of each other and are always kept.

    Args:
        delete_duplicate_in_bibs: When true, a bib file that contains
            duplicates is rewritten keeping only the first entry for each
            identifier.
    """
    bib_paths = iterate_obtain_full_file_names(self.path_abbr, ".bib", False)
    csv_paths = iterate_obtain_full_file_names(self.path_abbr, ".csv", False)

    # Base name -> csv path. setdefault keeps the first match when several
    # csv files share a base name.
    csv_by_base_name = {}
    for csv_path in csv_paths:
        csv_base_name = os.path.basename(csv_path).split(".")[0].strip()
        csv_by_base_name.setdefault(csv_base_name, csv_path)

    # The writer receives its own copy of the options.
    _python_writer = PythonWriters(dict(self._options))

    for full_bib in bib_paths:
        bib_base_name = os.path.basename(full_bib).split(".")[0].strip()
        full_url = csv_by_base_name.get(bib_base_name)
        if full_url is None:
            print(f"{bib_base_name}.csv not in the folder `url`.")
            continue

        print("*" * 5 + f" Check {os.path.basename(full_bib)} and {os.path.basename(full_url)} " + "*" * 5)
        bib_list = read_list(full_bib, "r")

        # Check duplicated blocks in bib file
        library = self._python_bib.parse_to_single_standard_library(bib_list)

        occurrences = {}  # identifier -> number of entries carrying it
        new_entries = []  # entries to keep, in their original order
        for entry in library.entries:
            doi = entry["doi"] if "doi" in entry else ""
            url_ = entry["url"] if "url" in entry else ""
            identifier = doi if doi else url_

            if not identifier:
                # No DOI and no URL: nothing to compare against, so keep it
                # rather than collapsing every such entry into one.
                new_entries.append(entry)
                continue

            occurrences[identifier] = occurrences.get(identifier, 0) + 1
            if occurrences[identifier] == 1:
                new_entries.append(entry)

        duplicate_url = [identifier for identifier, count in occurrences.items() if count > 1]
        if not duplicate_url:
            continue

        print(f"Duplicates in {full_bib}: {duplicate_url}\n")

        # Delete duplicated blocks in bib file
        if delete_duplicate_in_bibs:
            _python_writer.write_to_file(new_entries, full_bib, "w", None, False)
format_spider_bib
format_spider_bib(write_bib=False)

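Parse every `.bib` file in the abbreviation folder and, when `write_bib` is true, rewrite each file from the parsed entries and regenerate `README.md`.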

Source code in pybibtexer/tools/spider/process_spider_bib.py
def format_spider_bib(self, write_bib: bool = False) -> None:
    """Parse every bib file and optionally rewrite it together with README.md.

    Every ``.bib`` file found under ``self.path_abbr`` is parsed into a
    nested entries dictionary. With ``write_bib`` left false nothing is
    written. With ``write_bib`` true, an existing ``README.md`` in the folder
    is removed first; then, for each bib file, the README lines not already
    present are appended and the bib file is overwritten with the entries
    collected from the parsed dictionary.

    Args:
        write_bib: Whether to write the README and the bib files.
    """

    def strip_spaces(text):
        # Lines are compared with all spaces removed.
        return re.sub(r"[ ]+", "", text)

    bib_paths = iterate_obtain_full_file_names(self.path_abbr, ".bib", False)

    if write_bib:
        # Start from a clean README; it is rebuilt by appending below.
        readme = os.path.join(self.path_abbr, "README.md")
        if os.path.exists(readme):
            os.remove(readme)

    # The writer receives its own copy of the options.
    _python_writer = PythonWriters(dict(self._options))

    for bib_path in bib_paths:
        print("*" * 5 + f" Format {os.path.basename(bib_path)} " + "*" * 5)

        # standardize
        raw_lines = read_list(bib_path, "r")
        nested_entries = self._python_bib.parse_to_nested_entries_dict(raw_lines)
        if not write_bib:
            continue

        # just for the necessary part
        existing_lines = [strip_spaces(line) for line in read_list("README.md", "r", self.path_abbr)]
        readme_additions = []
        collected_entries = []

        for entry_type in nested_entries:
            type_key = entry_type.lower()
            entries_of_type = nested_entries.get(type_key, {})

            # for README.md
            section = generate_readme(self.abbr_standard, type_key, entries_of_type)
            if existing_lines or readme_additions:
                # Something was already written: skip the first three lines
                # (presumably the header, confirm against generate_readme).
                section = section[3:]
            readme_additions.extend(line for line in section if strip_spaces(line) not in existing_lines)

            # for bib
            flattened = IterateCombineExtendDict().dict_update(copy.deepcopy(entries_of_type))
            collected_entries.extend(flattened)

        write_list(readme_additions, "README.md", "a", self.path_abbr, False)
        _python_writer.write_to_file(collected_entries, bib_path, "w", None, False)
    return None

Functions