Source code for cocopack.overleaf

import os, io, re, shutil
import bibtexparser # 1.x

from copy import copy
from glob import glob
from pathlib import Path
from PIL import Image
from tqdm.auto import tqdm
from datetime import datetime

__all__ = [
    'set_overleaf_root',
    'get_overleaf_root', 
    'get_overleaf_path', 
    'list_overleaf_projects',
    'gather_submission', 
    'find_tex_inputs', 
    'find_all_inputs', 
    'stitch_tex_files', 
    'get_bibtex_dir', 
    'get_bibtex_files', 
    'clean_bibtex_file', 
    'stitch_bibtex_files']

from .convert import convert_image

# Initial Setup -----------------------------------------------------------

def set_overleaf_root(overleaf_root=None):
    """Store the Overleaf root directory in the module-level ``OVERLEAF_ROOT``.

    Args:
        overleaf_root (str, optional): Path to the Overleaf root directory.
            When None, the user is prompted for it on stdin.
            Defaults to None.
    """
    global OVERLEAF_ROOT
    if overleaf_root is None:
        # No path supplied: ask for one interactively.
        overleaf_root = input('Enter the Overleaf root directory: ')
    OVERLEAF_ROOT = overleaf_root
def _check_overleaf_root():
    """Report whether a configured Overleaf root exists on disk.

    The root is looked up in the module globals first and in the
    ``OVERLEAF_ROOT`` environment variable second.

    Returns:
        bool: True when a root is configured and the path exists.
    """
    # Resolve through globals()/os.environ rather than the bare name, so an
    # environment-only configuration does not raise NameError.
    overleaf_root = globals().get('OVERLEAF_ROOT')
    if overleaf_root is None:
        overleaf_root = os.environ.get('OVERLEAF_ROOT')
    if overleaf_root is None:
        return False
    return os.path.exists(os.path.abspath(overleaf_root))


def _check_bibtexparser_version():
    """Return True when the imported bibtexparser is a 1.x release."""
    # Compare the major component only, so e.g. '10.0' is not accepted.
    return bibtexparser.__version__.split('.')[0] == '1'


if not _check_bibtexparser_version():
    raise ImportError("bibtexparser 1.x is required. To fix, try:"
                      "\npip install bibtexparser~=1.0")

# Core Functions ----------------------------------------------------------
def get_overleaf_root(overleaf_root=None):
    """Get the root directory for Overleaf projects.

    Resolution order: the explicit argument, the module-level
    ``OVERLEAF_ROOT``, the ``OVERLEAF_ROOT`` environment variable, and
    finally an interactive prompt via ``set_overleaf_root()``.

    Args:
        overleaf_root (str, optional): Path to the Overleaf root directory.
            If provided, it is returned unchanged. Defaults to None.

    Returns:
        str: Path to the Overleaf root directory.
    """
    if overleaf_root is not None:
        return overleaf_root

    if 'OVERLEAF_ROOT' in globals():
        return globals()['OVERLEAF_ROOT']

    if 'OVERLEAF_ROOT' in os.environ:
        return os.environ['OVERLEAF_ROOT']

    # Nothing configured: prompt, then return what the prompt stored
    # (previously the still-None argument was returned here).
    set_overleaf_root()
    prompted_root = globals().get('OVERLEAF_ROOT')
    if not _check_overleaf_root():
        print('Warning: Overleaf root directory does not exist:',
              prompted_root)
    return prompted_root
def get_overleaf_path(project_name, overleaf_root=None):
    """Build the full path to an Overleaf project.

    Args:
        project_name (str): Name of the Overleaf project.
        overleaf_root (str, optional): Path to the Overleaf root directory.
            When None, it is resolved by get_overleaf_root().
            Defaults to None.

    Returns:
        str: The root directory joined with ``project_name``.
    """
    return os.path.join(get_overleaf_root(overleaf_root), project_name)
def list_overleaf_projects(overleaf_root=None, exclusions=None,
                           sort_by_date=True, **kwargs):
    """List all Overleaf projects in the root directory.

    A project is any directory directly inside the root.

    Args:
        overleaf_root (str, optional): Path to the Overleaf root directory.
            If None, gets it from get_overleaf_root(). Defaults to None.
        exclusions (list, optional): Substrings; projects whose name contains
            any of them are dropped. None or empty disables filtering.
            Defaults to None.
        sort_by_date (bool, optional): Whether to sort projects by
            modification time, newest first. Defaults to True.
        **kwargs: Additional keyword arguments.
            verbose (bool): If True, prints projects with their last
                modified dates. Defaults to False.

    Returns:
        list: List of Overleaf project names.
    """
    verbose = kwargs.pop('verbose', False)
    overleaf_root = get_overleaf_root(overleaf_root)  # fetch root
    exclusions = exclusions or []

    project_list = []
    for path in glob(os.path.join(overleaf_root, '*')):
        # Filter to directories before stat-ing: a broken symlink or plain
        # file never needs (and may not survive) a getmtime call.
        if not os.path.isdir(path):
            continue
        project = os.path.basename(path)
        if any(ex in project for ex in exclusions):
            continue
        project_list.append((os.path.getmtime(path), project))

    if sort_by_date:
        # Tuples sort on mtime first, so this yields newest-first order.
        project_list.sort(reverse=True)

    if verbose:
        for date, project in project_list:
            date = datetime.fromtimestamp(date).strftime('%Y-%m-%d')
            print(f'{project}: Last Modified {date}')

    return [project for _, project in project_list]
def get_overleaf_projects(overleaf_root=None, exclusions=[], sort_by_date=True, **kwargs):
    """Alias for list_overleaf_projects.

    Every argument is forwarded unchanged.

    Args:
        overleaf_root (str, optional): Path to the Overleaf root directory.
            If None, it is resolved by get_overleaf_root(). Defaults to None.
        exclusions (list, optional): Substrings; projects whose name contains
            any of them are filtered out. Defaults to an empty list.
        sort_by_date (bool, optional): Whether to sort projects by
            modification date. Defaults to True.
        **kwargs: Additional keyword arguments.
            verbose (bool): If True, prints projects with their last
                modified dates. Defaults to False.

    Returns:
        list: List of Overleaf project names.
    """
    projects = list_overleaf_projects(
        overleaf_root, exclusions, sort_by_date, **kwargs)
    return projects

# Gather Submission Materials ---------------------------------------------
def gather_submission(project_path, main_file, support_files, output_dir, **kwargs):
    """Gather LaTeX project files for submission.

    Stitches the main file and its inputs into one .tex file, copies the
    support files into a flat output directory, rewrites the references to
    those files in the stitched content, and optionally merges the copied
    .bib files into a single ``references.bib``.

    Args:
        project_path (str): Path to the project root directory.
        main_file (str): Name of the main LaTeX file.
        support_files (list): Supporting files to include (images, bibtex,
            etc.), as paths relative to ``project_path``.
        output_dir (str): Directory where gathered submission will be saved.
        **kwargs: Additional keyword arguments.
            prepend_project (bool): If True, prepend project_path to
                output_dir. Defaults to False.
            fresh_start (bool): If True, clear the output directory if it
                exists. Defaults to True.
            main_name (str): Name for the output main file.
                Defaults to 'manuscript.tex'.
            new_names (dict): Map of original filenames to new filenames.
                Defaults to {}.
            image_format (str): Convert images to this format if specified.
                Defaults to None.
            verbose (bool): If True, print detailed information. Note that
                the "clearing" and bibliography messages are printed unless
                verbose is explicitly False.
            stitch_bibtex (bool): If True, stitch bibtex files together.
                Defaults to True.
            exclude_comments (bool): If True, exclude commented lines when
                updating references. Defaults to True.

    Returns:
        None. The stitched manuscript and support files are written to
        ``output_dir``.
    """
    if kwargs.pop('prepend_project', False):
        output_dir = os.path.join(project_path, output_dir)

    # Ensure the output directory exists (optionally starting from empty).
    if not os.path.exists(output_dir):
        os.makedirs(output_dir, exist_ok=True)
    elif kwargs.get('fresh_start', True):
        if kwargs.get('verbose', True):
            print('Clearing the output directory:', output_dir)
        shutil.rmtree(output_dir)
        os.makedirs(output_dir, exist_ok=True)

    new_main = kwargs.pop('main_name', 'manuscript.tex')

    # Stitch (but don't yet write) main file .tex content
    content = stitch_tex_files(project_path, main_file,
                               content_only=True, **kwargs)

    original_to_new = {}  # source path -> path inside output_dir

    # optional renaming schema for materials
    new_names = kwargs.pop('new_names', {})
    image_extensions = Image.registered_extensions()
    image_format = kwargs.pop('image_format', None)

    # Copy files to the output directory, flattening the structure.
    # Flattening can make two sources share a basename; destinations already
    # used in THIS run get a numeric suffix so they do not overwrite each
    # other (stale files from an earlier run are still overwritten).
    used_destinations = set()
    for file_path in support_files:
        filename = os.path.basename(file_path)
        filename = new_names.get(filename, filename)
        new_path = os.path.join(output_dir, filename)

        base, ext = os.path.splitext(filename)
        count = 1
        while new_path in used_destinations:
            new_filename = f"{base}_{str(count).zfill(2)}{ext}"
            new_path = os.path.join(output_dir, new_filename)
            count += 1
        used_destinations.add(new_path)

        src_path = os.path.join(project_path, file_path)
        shutil.copyfile(src_path, new_path)
        original_to_new[file_path] = new_path

    image_files = [file_path for file_path in original_to_new
                   if os.path.splitext(file_path)[1].lower() in image_extensions]

    if image_format is not None:  # convert images to target format
        description = f'Converting Images to {image_format.upper()}'
        for file_path in tqdm(image_files, desc=description):
            if file_path.endswith(image_format):
                continue
            new_path = original_to_new[file_path]
            convert_image(new_path, image_format)
            # Swap only the trailing extension; str.replace could also hit
            # the same text earlier in the path.
            new_path = os.path.splitext(new_path)[0] + f'.{image_format}'
            original_to_new[file_path] = new_path  # update the mapping

    # Literal command text (used with str.replace, so no regex escaping).
    last_bibliography = "\\bibliography{references}"

    # Update references in the content
    for old_path, new_path in original_to_new.items():
        new_path = os.path.basename(new_path)  # relative to output_dir
        new_name, _ = os.path.splitext(new_path)
        old_name, _ = os.path.splitext(old_path)

        search_result = search_for_input(old_path, content, **kwargs)
        if search_result is None:
            continue

        match_base = search_result['match_base']
        context = search_result['in_command']
        extension_included = '.' in match_base

        if extension_included:  # update with path
            update = context.replace(old_path, new_path)
        else:  # update with name only
            update = context.replace(old_name, new_name)

        if kwargs.get('exclude_comments', True) and context.startswith('%'):
            continue  # skip commented lines

        new_string = new_path if extension_included else new_name
        if kwargs.get('verbose', False):
            print(f"Updating {match_base} to {new_string}"+
                  f" in {context}:\n -> {update}")

        content = content.replace(context, update)

        if 'bibliography' in update:
            last_bibliography = update

    if kwargs.pop('stitch_bibtex', True):
        # Work on absolute paths: stitch_bibtex_files joins everything onto
        # project_path, and os.path.join leaves absolute paths untouched.
        output_file = os.path.abspath(os.path.join(output_dir, 'references.bib'))
        bibtex_files = sorted(os.path.abspath(path) for path
                              in glob(os.path.join(output_dir, '*.bib')))

        if bibtex_files:
            if kwargs.get('verbose', False):
                print(f'Stitching {len(bibtex_files)} to {output_file}...')

            # Pass a copy and do the cleanup here, so the merged
            # references.bib is never removed when it was also an input.
            stitch_bibtex_files(project_path, list(bibtex_files), output_file,
                                cleanup=False, dry_run=False)
            for bibtex_path in bibtex_files:
                if bibtex_path != output_file:
                    os.remove(bibtex_path)

            new_bibliography = "\\bibliography{references}"
            content = content.replace(last_bibliography, new_bibliography)

            if kwargs.get('verbose', True):
                print(f"Updating {last_bibliography} to {new_bibliography}")

    write_content(os.path.join(output_dir, new_main), content)
# Find Document Input -----------------------------------------------------

def get_command_regex(search, input_only=False):
    r"""Build a regex matching a LaTeX command whose argument names ``search``.

    Args:
        search (str): File path or base name to look for; it is regex-escaped.
        input_only (bool, optional): If True, only ``\input`` is matched;
            otherwise ``\input``, ``\usepackage``, ``\bibliography`` and
            ``\includegraphics``. Defaults to False.

    Returns:
        str: Pattern covering an optional leading ``%``, the command, an
        optional ``[...]`` options group, and the braced argument holding
        ``search`` with an optional file extension.
    """
    if input_only:
        latex_commands = [r'\\input']
    else:
        latex_commands = [r'\\input', r'\\usepackage',
                          r'\\bibliography', r'\\includegraphics']

    command_pattern = f"({'|'.join(latex_commands)})"

    # [^{}] keeps the match inside ONE brace group; '.*?' would let
    # "\input{bar} see {foo}" count as a reference to "foo".
    return (rf"(?:% *\s*)?{command_pattern}(?:\[[^\]]*\])?"
            rf"\{{[^{{}}]*?\b{re.escape(search)}(\.\w+)?\b[^{{}}]*?\}}")


def search_for_input(file_path, content, **kwargs):
    """Find the LaTeX command in ``content`` that references ``file_path``.

    Args:
        file_path (str): Path of the file to look for; it is matched with
            or without its extension.
        content (str): LaTeX source to search.
        **kwargs: Additional keyword arguments.
            ignore_comments (bool): If True, matches that start with ``%``
                are skipped. Defaults to True.

    Returns:
        dict or None: ``{'match_base': ..., 'in_command': ...}`` for the
        last accepted match, or None if there is none. ``match_base`` is
        the base name, plus the extension when the matched command text
        contains it; ``in_command`` is the full matched text.
    """
    base_name, extension = os.path.splitext(file_path)
    search_pattern = get_command_regex(base_name)
    ignore_comments = kwargs.get('ignore_comments', True)

    results = None  # default to None
    match_count = 0

    for match in re.finditer(search_pattern, content):
        matched_text = match.group(0)
        if ignore_comments and matched_text.startswith('%'):
            continue  # skip commented lines

        match_name = base_name
        if extension in matched_text:
            match_name += extension

        results = {'match_base': match_name, 'in_command': matched_text}
        match_count += 1

    if match_count > 1:
        print('Warning: Multiple matches found for', file_path)

    return results  # dictionary with match_context
def find_tex_inputs(project_dir, main_file='main.tex', depth=0, **kwargs):
    r"""Recursively find all LaTeX \input{} commands in a file and its inputs.

    Args:
        project_dir (str): Path to the project directory.
        main_file (str, optional): Name of the main LaTeX file, relative to
            ``project_dir``. Defaults to 'main.tex'.
        depth (int, optional): Current recursion depth. Defaults to 0.
        **kwargs: Additional keyword arguments (also forwarded to
            search_for_input).
            max_depth (int): Maximum recursion depth. Defaults to 5.
            prepend_path (bool): If True, prepend the including file's
                directory to input files. Defaults to False.

    Returns:
        dict: ``{main_file: {"path": ..., "inputs": {...}}}`` where
        "inputs" holds the same structure for each input that was found.
        An empty dict is returned when the file is missing or the depth
        limit is exceeded.
    """
    max_depth = kwargs.get('max_depth', 5)
    if depth > max_depth:
        print(f"Warning: Maximum recursion depth reached at {main_file}. Stopping.")
        return {}

    file_path = os.path.join(project_dir, main_file)
    if not os.path.exists(file_path):
        print(f"Warning: File not found: {file_path}. Skipping...")
        return {}

    with open(file_path, 'r') as file:
        content = file.read()

    # initialize the structure at the main file
    structure = {main_file: {"path": file_path, "inputs": {}}}
    nested_inputs = structure[main_file]["inputs"]

    # Find all \input{} commands (the optional prefix also admits ones
    # preceded by '%'; search_for_input decides whether those count).
    inputs = re.findall(r'(?:%\s*)?\\input\{(.+?)\}', content)

    for input_file in inputs:
        if search_for_input(input_file, content, **kwargs) is None:
            continue  # skip this file

        if not input_file.endswith('.tex'):
            input_file += '.tex'

        input_path = input_file
        if kwargs.get('prepend_path', False):
            # NOTE(review): file_path already contains project_dir, and the
            # recursive call joins project_dir again; with a relative
            # project_dir that prefix ends up doubled. Confirm intent.
            input_path = os.path.join(os.path.dirname(file_path), input_file)

        # Recursively process the input file
        sub_structure = find_tex_inputs(project_dir, input_path,
                                        depth + 1, **kwargs)
        nested_inputs.update(sub_structure)

    return structure
def find_all_inputs(project_path, main_file, stitch_first=False, **kwargs):
    r"""Find all project files referenced in a LaTeX document.

    Every non-hidden file under ``project_path`` is searched for in the
    document via search_for_input, which looks at commands such as \input,
    \includegraphics, \bibliography and \usepackage.

    Args:
        project_path (str): Path to the project directory.
        main_file (str): Name of the main LaTeX file.
        stitch_first (bool, optional): If True, stitch all input files
            before searching. Defaults to False.
        **kwargs: Additional keyword arguments (also forwarded to
            search_for_input and stitch_tex_files).
            exclusions (list): Strings; files whose relative path contains
                any of them are dropped from the result.
            files_only (bool): If True, return only file paths without
                match context. Defaults to False.

    Returns:
        Union[dict, list]: Either a dictionary mapping relative file paths
        to their match context, or a list of file paths if files_only=True.
    """
    # List all non-hidden files recursively in the project path.
    all_files = []
    for root, dirs, files in os.walk(project_path):
        # Prune hidden directories in place so os.walk never descends into
        # them (e.g. .git), and sort for a deterministic result order.
        dirs[:] = sorted(d for d in dirs if not d.startswith('.'))
        for file in sorted(files):
            if file.startswith('.') or file == main_file:
                continue
            full_path = os.path.join(root, file)
            all_files.append(os.path.relpath(full_path, project_path))

    if stitch_first:
        # Stitch together all \inputs to main file first
        content = stitch_tex_files(project_path, main_file,
                                   content_only=True, **kwargs)
    else:
        # Read the contents of the main file
        content = read_content(os.path.join(project_path, main_file))

    results = {}  # relative path -> match context
    for relative_path in all_files:
        search_result = search_for_input(relative_path, content, **kwargs)
        if search_result is not None:
            results[relative_path] = search_result

    exclusions = kwargs.get('exclusions', None)
    if exclusions:
        results = {key: value for key, value in results.items()
                   if not any(exc in key for exc in exclusions)}

    if kwargs.get('files_only', False):
        return list(results)  # file_paths

    return results  # dictionary with file paths and match context
# Stitch Tex Documents ----------------------------------------------------

def write_content(file_path, content):
    """Write ``content`` to ``file_path``, replacing any existing file."""
    with open(file_path, 'w') as handle:
        handle.write(content)


def read_content(file_path):
    """Return the full text of the file at ``file_path``."""
    with open(file_path, 'r') as handle:
        return handle.read()


def update_paths(project_path, tex_file, updates, **kwargs):
    """Apply literal string replacements to a .tex file in place.

    Args:
        project_path (str): Path to the project directory.
        tex_file (str): File to rewrite, relative to ``project_path``.
        updates (dict): Maps each string to replace to its replacement;
            replacements are applied in the dict's iteration order.
        **kwargs: Additional keyword arguments.
            verbose (bool): If True, print each replacement.
                Defaults to False.
    """
    target_path = os.path.join(project_path, tex_file)
    text = read_content(target_path)

    for old_text, new_text in updates.items():
        if kwargs.get('verbose', False):
            print(f"Updating {old_text} to {new_text}")
        text = text.replace(old_text, new_text)

    write_content(os.path.join(project_path, tex_file), text)
def stitch_tex_files(project_dir, main_file='main.tex', output_file=None, **kwargs):
    r"""Stitch together a LaTeX document by resolving all \input commands.

    Args:
        project_dir (str): Path to the project directory.
        main_file (str, optional): Name of the main LaTeX file.
            Defaults to 'main.tex'.
        output_file (str, optional): Path (relative to ``project_dir``)
            where the stitched file is saved. Only used when
            ``content_only`` is False. Defaults to None.
        **kwargs: Additional keyword arguments (also forwarded to
            find_tex_inputs and search_for_input).
            exclude_with_comment (list): Patterns; matching \input commands
                are commented out instead of being expanded.
            exclude (list): Patterns; matching input files are left
                unexpanded.
            verbose (bool): If True, print detailed information.
                Defaults to False.
            content_only (bool): If True, only return the content without
                writing to a file. Defaults to True.
            ignore_comments (bool): If True, commented-out \input commands
                are left untouched. Defaults to True.

    Returns:
        str: The stitched LaTeX content (also after writing output_file).

    Raises:
        KeyError: If ``main_file`` cannot be found in ``project_dir``
            (find_tex_inputs then returns an empty structure).
    """
    comment_exclude = kwargs.pop('exclude_with_comment', [])
    content_only = kwargs.pop('content_only', True)
    exclusions = kwargs.get('exclude', [])
    verbose = kwargs.get('verbose', False)
    ignore_comments = kwargs.get('ignore_comments', True)

    def process_file(file_info):
        """Return the file's text with its \\input commands expanded."""
        with open(file_info['path'], 'r') as file:
            content = file.read()

        # Replace \input{} commands
        for input_file, input_info in file_info['inputs'].items():
            if verbose:
                print(f"Stitching \\input{{{input_file}}}")

            base_name, _ = os.path.splitext(input_file)

            if any(exc in input_file for exc in exclusions):
                continue  # skip rewriting of this file

            search_pattern = get_command_regex(base_name, True)

            if any(exc in search_pattern for exc in comment_exclude):
                content = re.sub(search_pattern,
                                 lambda x: f"%{x.group(0)}", content)
                continue

            if search_for_input(input_file, content, **kwargs) is None:
                continue  # skip this file

            input_content = process_file(input_info)

            def expand(match, replacement=input_content):
                # The pattern also matches "% \input{...}"; keep such
                # commented-out commands instead of pasting content over
                # the comment marker.
                if ignore_comments and match.group(0).startswith('%'):
                    return match.group(0)
                return replacement

            content = re.sub(search_pattern, expand, content)

        return content

    # Process the main file (first key in the structure)
    structure = find_tex_inputs(project_dir, main_file, **kwargs)
    stitched_content = process_file(structure[main_file])

    if content_only or output_file is None:
        return stitched_content  # return directly

    # Write the stitched content to the output file. The directory is
    # created from the joined path, i.e. inside project_dir rather than
    # relative to the current working directory.
    output_file = os.path.join(project_dir, output_file)
    output_parent = os.path.dirname(output_file)
    if output_parent:
        os.makedirs(output_parent, exist_ok=True)

    with open(output_file, 'w') as file:
        file.write(stitched_content)

    print(f"Stitched file created: {output_file}")
    return stitched_content
# Manage Bibtex Files -----------------------------------------------------
def get_bibtex_dir(project_name, bibtex_dir='citation', **kwargs):
    """Build the path to a project's BibTeX directory.

    Args:
        project_name (str): Name of the Overleaf project.
        bibtex_dir (str, optional): Name of the directory containing BibTeX
            files. Defaults to 'citation'.
        **kwargs: Additional keyword arguments.
            overleaf_root (str): Path to the Overleaf root directory; when
                absent it is resolved by get_overleaf_root().

    Returns:
        str: ``<overleaf_root>/<project_name>/<bibtex_dir>``.
    """
    root_dir = get_overleaf_root(kwargs.pop('overleaf_root', None))
    path_parts = (root_dir, project_name, bibtex_dir)
    return os.path.join(*path_parts)
def get_bibtex_files(project_path, bibtex_dir, other_dirs=None):
    """Get a list of BibTeX files in the specified directories.

    Args:
        project_path (str): Path to the project root directory.
        bibtex_dir (str): Primary directory containing BibTeX files, relative
            to ``project_path`` (an absolute path is also accepted). None
            means ``project_path`` itself.
        other_dirs (list, optional): Additional directories to search, with
            the same conventions as ``bibtex_dir``. Defaults to None.

    Returns:
        list: Paths of the ``*.bib`` files relative to ``project_path``,
        grouped in directory order and sorted by name within a directory.
    """
    # Process target bibtex directories + files:
    directories = [bibtex_dir]
    if other_dirs:
        directories.extend(other_dirs)

    bibtex_files = []
    for directory in directories:
        if directory is None:
            search_dir = str(project_path)
        else:
            # os.path.join (unlike string concatenation) keeps an absolute
            # directory intact instead of nesting it under project_path.
            search_dir = os.path.join(project_path, directory)
        # glob order is arbitrary; sort so results are reproducible.
        bibtex_files.extend(sorted(glob(os.path.join(search_dir, '*.bib'))))

    # make all paths relative to project path
    return [os.path.relpath(file_path, project_path)
            for file_path in bibtex_files]
def clean_bibtex_file(input_file_path, output_file_path=None):
    """Strip comment lines from a BibTeX file.

    A line is dropped when its first non-whitespace character is ``%``;
    all other lines are kept verbatim.

    Args:
        input_file_path (str): Path to the input BibTeX file.
        output_file_path (str, optional): Where to save the cleaned file.
            If None (or empty), the cleaned text is returned instead.
            Defaults to None.

    Returns:
        io.StringIO: Buffer holding the cleaned content when no output
        path is given, otherwise None.
    """
    with open(input_file_path, 'r', encoding='utf-8') as source:
        kept_lines = [raw_line for raw_line in source.readlines()
                      if not raw_line.strip().startswith('%')]

    if not output_file_path:
        # Return as StringIO if no output_file specified
        return io.StringIO(''.join(kept_lines))

    with open(output_file_path, 'w', encoding='utf-8') as target:
        target.writelines(kept_lines)
def parse_bibtex_file(bibtex_content, backend='bibtexparser'):
    """Parse BibTeX content using the specified backend.

    Args:
        bibtex_content (Union[str, io.StringIO]): BibTeX content as a
            string or StringIO object.
        backend (str, optional): Backend library to use for parsing.
            Options are 'bibtexparser' or 'pybtex'.
            Defaults to 'bibtexparser'.

    Returns:
        object: Parsed BibTeX database object (type depends on the backend
        used).

    Raises:
        ValueError: If the specified backend is not supported.
    """
    if isinstance(bibtex_content, io.StringIO):
        bibtex_content.seek(0)  # read from the beginning of the buffer
        bibtex_content = bibtex_content.read()

    # Each backend is imported only when selected, so the default backend
    # does not require pybtex to be installed.
    if backend == 'bibtexparser':
        import bibtexparser  # assumed version 1.X
        with io.StringIO(bibtex_content) as bibtex_file:
            return bibtexparser.load(bibtex_file)

    if backend == 'pybtex':
        from pybtex.database import parse_string
        return parse_string(bibtex_content, 'bibtex')

    # raise error if backend not supported
    raise ValueError("Unsupported backend specified.")

# Stitch Bibtex Files -----------------------------------------------------
def stitch_bibtex_files(project_path, bibtex_files, output_file,
                        cleanup=False, dry_run=True, **kwargs):
    """Combine multiple BibTeX files into a single file, removing duplicates.

    Entries are keyed by their BibTeX ID; when the same ID occurs more than
    once, the first occurrence (in file order) is kept.

    Args:
        project_path (str): Path to the project root directory.
        bibtex_files (Union[str, list]): Either a directory containing
            BibTeX files (or a single .bib file), or a list of BibTeX file
            paths relative to ``project_path``. The list is not modified.
        output_file (str): Path where the stitched file will be saved.
        cleanup (bool, optional): If True, delete or backup the original
            files. Defaults to False.
        dry_run (bool, optional): If True, don't write the stitched file or
            perform cleanup; only report. Defaults to True.
        **kwargs: Additional keyword arguments.
            prepend_project (bool): If True, prepend project_path to
                output_file. Defaults to True.
            backup_dir (str): Directory where original files will be backed
                up, if cleanup is True. When absent, originals are deleted.
            verbose (bool): If True, print detailed information.
                Defaults to False.

    Returns:
        None
    """
    verbose = kwargs.get('verbose', False)

    if kwargs.get('prepend_project', True):
        output_file = os.path.join(project_path, output_file)

    # Resolve the inputs into a fresh list of openable paths; the caller's
    # list is never written to.
    if isinstance(bibtex_files, str):
        source = bibtex_files
        if not os.path.exists(source):
            # Not reachable as given: try it relative to the project.
            source = os.path.join(project_path, bibtex_files)
        if os.path.isdir(source):
            files_to_process = sorted(glob(os.path.join(source, '*.bib')))
        elif os.path.isfile(source):
            files_to_process = [source]
        else:
            files_to_process = []
    else:  # assume list of files
        files_to_process = [os.path.join(project_path, file_path)
                            for file_path in bibtex_files]

    if not dry_run:  # Ensure the output directory exists
        os.makedirs(os.path.dirname(output_file) or '.', exist_ok=True)

    stitched_entries = {}

    # Read all .bib files and accumulate unique entries
    for file_path in files_to_process:
        with open(file_path, 'r') as bibtex_file:
            bib_database = bibtexparser.load(bibtex_file)

        if verbose:  # of entries fetched
            print(f"{len(bib_database.entries)} entries fetched",
                  f"from {os.path.basename(file_path)}")

        for entry in bib_database.entries:
            entry_id = entry.get('ID', None)
            # Deduplicate against the entries collected so far.
            if entry_id and entry_id not in stitched_entries:
                stitched_entries[entry_id] = entry

    if verbose or dry_run:
        # report number of unique entries
        print(f"{len(stitched_entries)} unique entries across",
              f"{len(files_to_process)} bibtex files")

    if not dry_run:  # Write unique entries to output_file
        with open(output_file, 'w') as write_file:
            writer = bibtexparser.bwriter.BibTexWriter()
            db = bibtexparser.bibdatabase.BibDatabase()
            db.entries = list(stitched_entries.values())
            write_file.write(writer.write(db))  # stitch

        print(f"Bibtex entries stitched to: {output_file}")

    else:  # Report the output file name without writing
        print(f"Dry-Run: Entries stitched to {output_file}")

    if cleanup:  # delete or move stitched files to backup
        # Never clean up the output itself, even if it was also an input.
        output_abspath = os.path.abspath(output_file)
        cleanup_targets = [file_path for file_path in files_to_process
                           if os.path.abspath(file_path) != output_abspath]

        timestamp = datetime.now().strftime("%Y-%m-%d")

        if kwargs.get('backup_dir', None) is not None:
            backup_dir = kwargs.get('backup_dir')
            output_dir = f'{Path(output_file).parent.parent}/{backup_dir}'
            backup_dir = f"{output_dir}/backup/{timestamp}"

            if not dry_run:  # build the backup directory
                os.makedirs(backup_dir, exist_ok=True)

                for file_path in cleanup_targets:
                    dst = os.path.join(backup_dir,
                                       os.path.basename(file_path))
                    shutil.move(file_path, dst)

                    if verbose:
                        print(f"Moving {file_path} to {dst}")

            else:  # report the move without actually moving
                for file_path in cleanup_targets:
                    print(f"Would move {file_path} to {backup_dir}")

        else:  # delete the stitched files
            for file_path in cleanup_targets:
                action_report = 'Would delete'
                if not dry_run:  # delete file
                    os.remove(file_path)
                    action_report = 'Deleting'

                if verbose:
                    print(f"{action_report} {file_path}")