tools¶

Classes¶

Functions¶

generate_nested_dict ¶

generate_nested_dict(path_storage)

Generate a nested dictionary structure representing directory hierarchy.

Recursively walks through a directory tree and constructs a nested dictionary that mirrors the folder structure with files organized under their respective directories.

Parameters:

Name	Type	Description	Default
`path_storage`	`str`	Root directory path to generate structure from.	required

Returns:

Name	Type	Description
`dict`	`dict`	Nested dictionary representing directory hierarchy with sorted file lists.

Examples:

>>> generate_nested_dict("/path/to/project")
{
    'folder1': {
        'subfolder1': ['file1.txt', 'file2.txt'],
        'subfolder2': ['file3.txt']
    },
    'folder2': ['file4.txt']
}

Source code in pyadvtools/tools.py

def generate_nested_dict(path_storage: str) -> dict:
    """Generate a nested dictionary structure representing directory hierarchy.

    Recursively walks through a directory tree and constructs a nested
    dictionary that mirrors the folder structure with files organized
    under their respective directories.

    Args:
        path_storage: Root directory path to generate structure from.

    Returns:
        dict: Nested dictionary representing directory hierarchy with sorted file lists.

    Examples:
        >>> generate_nested_dict("/path/to/project")
        {
            'folder1': {
                'subfolder1': ['file1.txt', 'file2.txt'],
                'subfolder2': ['file3.txt']
            },
            'folder2': ['file4.txt']
        }
    """
    # Initialize dictionary to store flat file structure
    files_dict = {}

    # Recursively walk through all directories and subdirectories
    for root, _, files in os.walk(path_storage, topdown=True):
        for file in files:
            # Create relative path key by removing root path prefix
            # Use os.path.normpath for cross-platform path handling
            relative_path = os.path.normpath(os.path.join(root, file))
            relative_to_storage = os.path.relpath(relative_path, path_storage)
            f = "." + os.path.sep + relative_to_storage

            # Group files by their relative directory path
            relative_dir = os.path.relpath(root, path_storage)
            files_dict.setdefault(relative_dir, []).append(f)

    # Sort file lists alphabetically for each directory
    files_dict = {k: sorted(v) for k, v in files_dict.items()}

    # Initialize nested dictionary structure
    nested_dict = {}

    # Convert flat directory structure to nested hierarchy
    for k, v in files_dict.items():
        # Split path into individual directory components
        keys = [k for k in Path(k).parts if k != os.sep]

        # Skip empty paths (root directory case)
        if not keys:
            continue

        # Create nested dictionary structure for current path
        temp_dict = {keys[-1]: v}
        # Build nested structure backwards (from deepest to shallowest level)
        for j in keys[::-1][1:]:
            temp_dict = {j: temp_dict}

        # Merge the temporary nested structure into the main nested dictionary
        nested_dict = IterateUpdateDict().dict_update(nested_dict, temp_dict)

    # Recursively sort the nested dictionary structure
    nested_dict = IterateSortDict().dict_update(nested_dict)
    return nested_dict

iterate_obtain_full_file_names ¶

iterate_obtain_full_file_names(
    path_storage,
    extension,
    reverse=True,
    is_standard_file_name=True,
    search_year_list=[],
)

Recursively retrieve full file paths with specified extension.

Walks through a directory tree and collects files matching the given extension, with optional filtering based on year patterns and sorting.

Parameters:

Name	Type	Description	Default
`path_storage`	`str`	Root directory path to search for files.	required
`extension`	`str`	Target file extension to filter (e.g., 'txt', 'csv').	required
`reverse`	`bool`	If True, sorts files in reverse order; otherwise natural order.	`True`
`is_standard_file_name`	`bool`	If True, enables year-based filtering.	`True`
`search_year_list`	`list[str]`	List of years to filter filenames.	`[]`

Returns:

Type	Description
`list[str]`	List[str]: List of full file paths matching criteria, sorted accordingly.

Examples:

>>> files = iterate_obtain_full_file_names("/path", "txt", True, True, ["2023"])
# Returns all .txt files from 2023, sorted in reverse order

Source code in pyadvtools/tools.py

def iterate_obtain_full_file_names(
    path_storage: str,
    extension: str,
    reverse: bool = True,
    is_standard_file_name: bool = True,
    search_year_list: list[str] = [],
) -> list[str]:
    """Recursively retrieve full file paths with specified extension.

    Walks through a directory tree and collects files matching the given
    extension, with optional filtering based on year patterns and sorting.

    Args:
        path_storage: Root directory path to search for files.
        extension: Target file extension to filter (e.g., 'txt', 'csv').
        reverse: If True, sorts files in reverse order; otherwise natural order.
        is_standard_file_name: If True, enables year-based filtering.
        search_year_list: List of years to filter filenames.

    Returns:
        List[str]: List of full file paths matching criteria, sorted accordingly.

    Examples:
        >>> files = iterate_obtain_full_file_names("/path", "txt", True, True, ["2023"])
        # Returns all .txt files from 2023, sorted in reverse order
    """
    # Return empty list if the target directory does not exist
    if not os.path.exists(path_storage):
        return []

    # Compile regex pattern for year filtering if enabled and years are provided
    regex = None
    if is_standard_file_name and search_year_list:
        # Create regex pattern matching any of the specified years, such as AAAI_2020.bib
        regex = re.compile(f"({'|'.join(search_year_list)})")

    file_list = []
    # Recursively walk through all directories and subdirectories
    for root, _, files in os.walk(path_storage, topdown=True):
        # Filter files by the target extension (handling double dots edge case)
        files = [f for f in files if f.endswith(f".{extension}".replace("..", "."))]

        # Apply year-based filtering if regex pattern is available
        if regex:
            files = [f for f in files if regex.search(f)]

        # Convert filenames to full paths and add to result list
        file_list.extend([os.path.join(root, f) for f in files])

    # Sort files using natural numeric and string sorting
    file_list = sort_int_str(file_list, reverse=reverse)
    return file_list

transform_to_data_list ¶

transform_to_data_list(
    original_data,
    extension,
    reverse=False,
    is_standard_file_name=True,
    search_year_list=[],
    insert_flag=None,
    before_after="after",
)

Transform input data from various formats into a unified list of strings.

Supports multiple input types including directories, files, raw strings, and string lists, returning a consolidated list of text lines.

Parameters:

Name	Type	Description	Default
`original_data`	`list[str] \| str`	Input source - directory path, file path, multi-line string, or list of strings.	required
`extension`	`str`	Target file extension to filter when processing directories.	required
`reverse`	`bool`	Whether to reverse the order of files when reading from directory.	`False`
`is_standard_file_name`	`bool`	Whether to use standardized file name processing.	`True`
`search_year_list`	`list[str]`	Optional list of years to filter files by.	`[]`
`insert_flag`	`list[str] \| str \| None`	Content to insert between combined data chunks.	`None`
`before_after`	`str`	Insert position relative to existing content.	`'after'`

Returns:

Type	Description
`list[str]`	List[str]: Consolidated list of text lines from all processed sources.

Examples:

>>> transform_to_data_list("/path/to/files", "txt")
# Returns combined content from all .txt files in directory
>>> transform_to_data_list("line1\nline2", "txt")
['line1\n', 'line2\n']

Source code in pyadvtools/tools.py

def transform_to_data_list(
    original_data: list[str] | str,
    extension: str,
    reverse: bool = False,
    is_standard_file_name: bool = True,
    search_year_list: list[str] = [],
    insert_flag: list[str] | str | None = None,
    before_after: str = "after",
) -> list[str]:
    r"""Transform input data from various formats into a unified list of strings.

    Supports multiple input types including directories, files, raw strings,
    and string lists, returning a consolidated list of text lines.

    Args:
        original_data: Input source - directory path, file path, multi-line
                      string, or list of strings.
        extension: Target file extension to filter when processing directories.
        reverse: Whether to reverse the order of files when reading from directory.
        is_standard_file_name: Whether to use standardized file name processing.
        search_year_list: Optional list of years to filter files by.
        insert_flag: Content to insert between combined data chunks.
        before_after: Insert position relative to existing content.

    Returns:
        List[str]: Consolidated list of text lines from all processed sources.

    Examples:
        >>> transform_to_data_list("/path/to/files", "txt")
        # Returns combined content from all .txt files in directory
        >>> transform_to_data_list("line1\nline2", "txt")
        ['line1\n', 'line2\n']
    """
    # Handle string input (directory path, file path, or multi-line string)
    if isinstance(original_data, str):
        # Process directory input
        if os.path.isdir(original_data):
            # Get all files with target extension from directory
            files = iterate_obtain_full_file_names(
                standard_path(original_data), extension, reverse, is_standard_file_name, search_year_list
            )

            # Read all files and combine their contents
            data_list = combine_content_in_list([read_list(f, "r", None) for f in files], insert_flag, before_after)

        # Process file input (with matching extension or existing file)
        elif original_data.strip().endswith(extension) or os.path.isfile(original_data):
            # Read all lines from the file
            data_list = read_list(original_data, "r", None)

        # Process multi-line string input
        else:
            # Split string into lines while preserving line endings
            data_list = original_data.splitlines(keepends=True)

    # Handle list input (return directly)
    else:
        data_list = original_data
    return data_list