import os, io, re, shutil
import bibtexparser # 1.x
from copy import copy
from glob import glob
from pathlib import Path
from PIL import Image
from tqdm.auto import tqdm
from datetime import datetime
# Names exported by a star-import of this module.
__all__ = [
    'set_overleaf_root',
    'get_overleaf_root',
    'get_overleaf_path',
    'list_overleaf_projects',
    'gather_submission',
    'find_tex_inputs',
    'find_all_inputs',
    'stitch_tex_files',
    'get_bibtex_dir',
    'get_bibtex_files',
    'clean_bibtex_file',
    'stitch_bibtex_files',
]
from .convert import convert_image
# Initial Setup -----------------------------------------------------------
[docs]
def set_overleaf_root(overleaf_root=None):
    """Store the Overleaf root directory in the module-level ``OVERLEAF_ROOT``.

    Args:
        overleaf_root (str, optional): Path to the Overleaf root directory.
            When None, the path is requested interactively through
            ``input``. Defaults to None.
    """
    global OVERLEAF_ROOT
    if overleaf_root is None:
        # No explicit value was supplied: ask for one on the console.
        overleaf_root = input('Enter the Overleaf root directory: ')
    OVERLEAF_ROOT = overleaf_root
def _check_overleaf_root():
    """Report whether a configured Overleaf root exists on disk.

    The root is looked up in the module-level ``OVERLEAF_ROOT`` global first
    and in the ``OVERLEAF_ROOT`` environment variable second.

    Returns:
        bool: True when a root is configured and its absolute path exists;
        False otherwise, including when no root is configured at all.
    """
    overleaf_root = globals().get('OVERLEAF_ROOT')
    if overleaf_root is None:
        # Read the environment explicitly: referencing the bare global here
        # would raise NameError whenever only the environment is configured.
        overleaf_root = os.environ.get('OVERLEAF_ROOT')
    if overleaf_root is None:
        return False
    return os.path.exists(os.path.abspath(overleaf_root))
def _check_bibtexparser_version():
    """Return True when the imported bibtexparser reports a 1.x version."""
    return bibtexparser.__version__.startswith('1')


# Fail at import time: this module calls the bibtexparser 1.x API
# (bibtexparser.load, bwriter.BibTexWriter, bibdatabase.BibDatabase).
if not _check_bibtexparser_version():
    # One message string, so the exception text is readable instead of being
    # rendered as a tuple of arguments.
    raise ImportError(
        "bibtexparser 1.x is required "
        f"(found {bibtexparser.__version__}). To fix, try:"
        "\npip install bibtexparser~=1.0")
# Core Functions ----------------------------------------------------------
[docs]
def get_overleaf_root(overleaf_root=None):
    """Get the root directory for Overleaf projects.

    Resolution order: the explicit argument, the module-level
    ``OVERLEAF_ROOT`` global, the ``OVERLEAF_ROOT`` environment variable,
    and finally an interactive prompt (via ``set_overleaf_root``).

    Args:
        overleaf_root (str, optional): Path to the Overleaf root directory.
            If provided, it is returned unchanged. Defaults to None.

    Returns:
        str: Path to the Overleaf root directory.
    """
    if overleaf_root is not None:
        # An explicit root never depends on the module global being set.
        return overleaf_root
    if 'OVERLEAF_ROOT' in globals():
        return globals()['OVERLEAF_ROOT']
    if 'OVERLEAF_ROOT' in os.environ:
        return os.environ['OVERLEAF_ROOT']
    # Nothing configured: prompt, then hand back the value that was stored
    # (returning the still-None argument would break os.path.join callers).
    set_overleaf_root()
    return globals().get('OVERLEAF_ROOT')
[docs]
def get_overleaf_path(project_name, overleaf_root=None):
    """Build the path of an Overleaf project inside the Overleaf root.

    Args:
        project_name (str): Name of the Overleaf project.
        overleaf_root (str, optional): Path to the Overleaf root directory.
            Resolved through get_overleaf_root(), which supplies a root when
            this is None. Defaults to None.

    Returns:
        str: The resolved root joined with ``project_name``.
    """
    return os.path.join(get_overleaf_root(overleaf_root), project_name)
[docs]
def list_overleaf_projects(overleaf_root=None, exclusions=[], sort_by_date=True, **kwargs):
    """List the project directories found directly under the Overleaf root.

    Args:
        overleaf_root (str, optional): Path to the Overleaf root directory.
            Resolved through get_overleaf_root(). Defaults to None.
        exclusions (list, optional): Substrings; a project whose name
            contains any of them is left out. Defaults to an empty list.
        sort_by_date (bool, optional): Sort by modification time, most
            recently modified first. Defaults to True.
        **kwargs: Additional keyword arguments.
            verbose (bool): If True, print each project with its last
                modified date. Defaults to False.

    Returns:
        list: Project directory names.
    """
    root = get_overleaf_root(overleaf_root)

    dated_projects = []  # (modification time, directory name) pairs
    for path in glob(os.path.join(root, '*')):
        modified = os.path.getmtime(path)
        if not os.path.isdir(path):
            continue  # only directories count as projects
        name = os.path.basename(path)
        if any(token in name for token in (exclusions or ())):
            continue
        dated_projects.append((modified, name))

    if sort_by_date:
        # Tuples compare by time first, so reverse order is newest first.
        dated_projects.sort(reverse=True)

    if kwargs.pop('verbose', False):
        for modified, name in dated_projects:
            stamp = datetime.fromtimestamp(modified).strftime('%Y-%m-%d')
            print(f'{name}: Last Modified {stamp}')

    return [name for _, name in dated_projects]
def get_overleaf_projects(overleaf_root=None, exclusions=[], sort_by_date=True, **kwargs):
    """Alias of list_overleaf_projects; every argument is forwarded as given.

    Args:
        overleaf_root (str, optional): Path to the Overleaf root directory.
            Defaults to None.
        exclusions (list, optional): Substrings used to filter out projects.
            Defaults to an empty list.
        sort_by_date (bool, optional): Whether to sort projects by
            modification date. Defaults to True.
        **kwargs: Additional keyword arguments accepted by
            list_overleaf_projects (e.g. ``verbose``).

    Returns:
        list: List of Overleaf project names.
    """
    return list_overleaf_projects(
        overleaf_root=overleaf_root,
        exclusions=exclusions,
        sort_by_date=sort_by_date,
        **kwargs,
    )
# Gather Submission Materials ---------------------------------------------
[docs]
def gather_submission(project_path, main_file, support_files, output_dir, **kwargs):
    """Gather a LaTeX project into a flat, submission-ready directory.

    The main file is stitched into one ``.tex`` document, every support file
    is copied next to it (the directory structure is flattened), references
    to the copied files inside the stitched document are rewritten to their
    new names and, optionally, the copied BibTeX files are merged into a
    single ``references.bib``.

    Args:
        project_path (str): Path to the project root directory.
        main_file (str): Name of the main LaTeX file.
        support_files (list): Supporting files to include (images, bibtex,
            etc.), given relative to ``project_path``.
        output_dir (str): Directory where the gathered submission is saved.
        **kwargs: Additional keyword arguments. Everything except
            ``prepend_project`` and ``main_name`` is also forwarded to
            stitch_tex_files.
            prepend_project (bool): If True, prepend project_path to
                output_dir. Defaults to False.
            fresh_start (bool): If True, clear the output directory if it
                exists. Defaults to True.
            main_name (str): Name for the output main file. Defaults to
                'manuscript.tex'.
            new_names (dict): Map of original filenames to new filenames.
                Defaults to {}.
            image_format (str): Convert images to this format if specified.
                Defaults to None.
            verbose (bool): If True, print detailed information.
                Defaults to False.
            stitch_bibtex (bool): If True, stitch bibtex files together.
                Defaults to True.
            exclude_comments (bool): If True, exclude commented lines when
                updating references. Defaults to True.
    """
    if kwargs.pop('prepend_project', False):
        output_dir = os.path.join(project_path, output_dir)
    verbose = kwargs.get('verbose', False)

    # Start from an empty output directory unless told otherwise.
    if os.path.exists(output_dir) and kwargs.get('fresh_start', True):
        if verbose:
            print('Clearing the output directory:', output_dir)
        shutil.rmtree(output_dir)
    os.makedirs(output_dir, exist_ok=True)

    new_main = kwargs.pop('main_name', 'manuscript.tex')

    # Stitch (but don't yet write) the main file's .tex content.
    content = stitch_tex_files(project_path, main_file,
                               content_only=True, **kwargs)

    new_names = kwargs.pop('new_names', {})  # optional renaming schema
    image_format = kwargs.pop('image_format', None)

    # Copy files into the output directory, flattening the structure.
    original_to_new = {}  # support file path -> path of its copy
    claimed = set()       # destinations already used during this run
    for file_path in support_files:
        if file_path in original_to_new:
            continue  # listed twice: one copy is enough
        filename = os.path.basename(file_path)
        filename = new_names.get(filename, filename)
        new_path = os.path.join(output_dir, filename)
        if new_path in claimed:
            # Two inputs flatten to the same name: add a numeric suffix
            # rather than silently overwriting the earlier copy.
            base, ext = os.path.splitext(filename)
            count = 1
            while new_path in claimed:
                new_path = os.path.join(output_dir, f"{base}_{count:02d}{ext}")
                count += 1
        claimed.add(new_path)
        shutil.copyfile(os.path.join(project_path, file_path), new_path)
        original_to_new[file_path] = new_path

    if image_format is not None:  # convert images to the target format
        target_ext = f'.{image_format}'
        image_extensions = Image.registered_extensions()
        image_files = [file_path for file_path in original_to_new if
                       os.path.splitext(file_path)[1] in image_extensions]
        description = f'Converting Images to {image_format.upper()}'
        for file_path in tqdm(image_files, desc=description):
            if os.path.splitext(file_path)[1] == target_ext:
                continue  # already in the target format
            new_path = original_to_new[file_path]
            convert_image(new_path, image_format)
            # Swap only the trailing extension; a plain str.replace would
            # also rewrite the same text elsewhere in the path.
            # assumes convert_image writes next to its input with the new
            # extension -- TODO confirm against .convert
            stem, _ = os.path.splitext(new_path)
            original_to_new[file_path] = stem + target_ext

    # Update references in the content.
    last_bibliography = None  # most recent rewritten \bibliography command
    for old_path, new_path in original_to_new.items():
        new_file = os.path.basename(new_path)  # references become relative
        new_name, _ = os.path.splitext(new_file)
        old_name, _ = os.path.splitext(old_path)
        search_result = search_for_input(old_path, content, **kwargs)
        if search_result is None:
            continue
        match_base = search_result['match_base']
        context = search_result['in_command']
        if kwargs.get('exclude_comments', True) and context.startswith('%'):
            continue  # skip commented lines
        if '.' in match_base:  # the command spells out the extension
            new_string = new_file
            update = context.replace(old_path, new_file)
        else:  # the command references the bare name
            new_string = new_name
            update = context.replace(old_name, new_name)
        if verbose:
            print(f"Updating {match_base} to {new_string}" +
                  f" in {context}:\n -> {update}")
        content = content.replace(context, update)
        if 'bibliography' in update:
            last_bibliography = update

    if kwargs.pop('stitch_bibtex', True):
        # Absolute paths keep stitch_bibtex_files from re-rooting them under
        # project_path (joining onto an absolute path leaves it unchanged).
        output_file = os.path.abspath(
            os.path.join(output_dir, 'references.bib'))
        bibtex_files = sorted(
            os.path.abspath(os.path.join(output_dir, name))
            for name in get_bibtex_files(output_dir, None))
        if verbose:
            print(f'Stitching {len(bibtex_files)} to {output_file}...')
        stitch_bibtex_files(project_path, list(bibtex_files), output_file,
                            cleanup=False, dry_run=False)
        # Remove the merged inputs here so that a copied 'references.bib'
        # (which is also the output) is not deleted right after writing.
        for bibtex_path in bibtex_files:
            if bibtex_path != output_file:
                os.remove(bibtex_path)
        new_bibliography = "\\bibliography{references}"
        if last_bibliography is not None:
            content = content.replace(last_bibliography, new_bibliography)
            if verbose:
                print(f"Updating {last_bibliography} to {new_bibliography}")

    write_content(os.path.join(output_dir, new_main), content)
# Find Document Input -----------------------------------------------------
def get_command_regex(search, input_only=False):
    r"""Build a regex for a LaTeX command whose braces reference ``search``.

    The pattern matches an optional leading ``%`` (a commented-out command),
    one of the supported commands, an optional ``[...]`` argument and a
    ``{...}`` argument containing ``search`` as a whole word, optionally
    followed by a file extension.

    Args:
        search (str): File name or path (without extension) to look for. It
            is escaped, so regex metacharacters are matched literally.
        input_only (bool, optional): If True, only ``\input`` is considered;
            otherwise ``\input``, ``\usepackage``, ``\bibliography`` and
            ``\includegraphics`` are. Defaults to False.

    Returns:
        str: The regular expression source. Group 1 is the command name and
        group 2 the extension following ``search`` (if any).
    """
    if input_only:
        latex_commands = [r'\\input']
    else:
        latex_commands = [r'\\input', r'\\usepackage',
                          r'\\bibliography', r'\\includegraphics']
    command_pattern = f"({'|'.join(latex_commands)})"
    # The brace argument may not contain braces or newlines; otherwise a
    # match could start at an earlier command on the same line and swallow
    # everything up to the command that actually names ``search``.
    return (rf"(?:% *\s*)?{command_pattern}(?:\[[^\]]*\])?" +
            rf"\{{[^{{}}\n]*?\b{re.escape(search)}(\.\w+)?\b[^{{}}\n]*?\}}")
def search_for_input(file_path, content, **kwargs):
    """Find the LaTeX command in ``content`` that references ``file_path``.

    Args:
        file_path (str): Path of the referenced file; it is searched for
            without its extension.
        content (str): LaTeX source to search.
        **kwargs: Additional keyword arguments.
            ignore_comments (bool): If True, matches that start with ``%``
                are skipped. Defaults to True.

    Returns:
        dict: ``{'match_base': ..., 'in_command': ...}`` for the last
        retained match, where ``match_base`` is the file path without its
        extension (with it when the extension appears in the matched text)
        and ``in_command`` is the matched text; None when nothing matched.
        A warning is printed when more than one match is retained.
    """
    stem, suffix = os.path.splitext(file_path)
    pattern = get_command_regex(stem)

    retained = []
    for hit in re.finditer(pattern, content):
        command = hit.group(0)
        if kwargs.get('ignore_comments', True) and command.startswith('%'):
            continue  # commented-out command
        matched_name = stem + suffix if suffix in command else stem
        retained.append({'match_base': matched_name,
                         'in_command': command})

    if len(retained) > 1:
        print('Warning: Multiple matches found for', file_path)
    return retained[-1] if retained else None
# Stitch Tex Documents ----------------------------------------------------
def write_content(file_path, content):
    """Write ``content`` to ``file_path`` in text mode, replacing the file."""
    with open(file_path, mode='w') as sink:
        sink.write(content)
def read_content(file_path):
    """Return the entire text content of ``file_path``."""
    with open(file_path, mode='r') as source:
        return source.read()
def update_paths(project_path, tex_file, updates, **kwargs):
    """Apply literal text replacements to a project file, in place.

    Args:
        project_path (str): Path to the project root directory.
        tex_file (str): File to rewrite, relative to ``project_path``.
        updates (dict): Maps each text to replace to its replacement;
            applied in the dict's iteration order.
        **kwargs: Additional keyword arguments.
            verbose (bool): If True, print each replacement.
                Defaults to False.
    """
    target = os.path.join(project_path, tex_file)
    text = read_content(target)
    for old_text, new_text in updates.items():
        if kwargs.get('verbose', False):
            print(f"Updating {old_text} to {new_text}")
        text = text.replace(old_text, new_text)
    write_content(target, text)
[docs]
def stitch_tex_files(project_dir, main_file='main.tex', output_file=None, **kwargs):
    r"""Stitch together a LaTeX document by resolving all \input commands.

    The input structure comes from find_tex_inputs; each entry is expected
    to provide a 'path' to read and an 'inputs' mapping of nested entries.

    Args:
        project_dir (str): Path to the project directory.
        main_file (str, optional): Name of the main LaTeX file.
            Defaults to 'main.tex'.
        output_file (str, optional): Where to save the stitched file,
            relative to ``project_dir``. Only used when ``content_only`` is
            False. Defaults to None.
        **kwargs: Additional keyword arguments, also forwarded to
            find_tex_inputs and search_for_input.
            exclude_with_comment (list): Patterns whose \input commands are
                commented out instead of being inlined.
            exclude (list): Substrings; input files containing any of them
                are left untouched.
            verbose (bool): If True, print detailed information.
                Defaults to False.
            content_only (bool): If True, only return the content without
                writing to a file. Defaults to True.

    Returns:
        str: The stitched LaTeX content.
    """
    comment_exclude = kwargs.pop('exclude_with_comment', [])
    exclusions = kwargs.get('exclude', [])
    verbose = kwargs.get('verbose', False)

    def process_file(file_info):
        """Return the file's content with its \\input commands inlined."""
        with open(file_info['path'], 'r') as file:
            content = file.read()
        # Replace \input{} commands
        for input_file, input_info in file_info['inputs'].items():
            if verbose:
                print(f"Stitching \\input{{{input_file}}}")
            if any(exc in input_file for exc in exclusions):
                continue  # skip rewriting of this file
            base_name, _ = os.path.splitext(input_file)
            search_pattern = get_command_regex(base_name, True)
            # NOTE(review): these patterns are tested against the generated
            # regex source, not against the file name -- confirm intent.
            if any(exc in search_pattern for exc in comment_exclude):
                content = re.sub(search_pattern,
                                 lambda m: f"%{m.group(0)}", content)
                continue
            if search_for_input(input_file, content, **kwargs) is None:
                continue  # no active \input for this file
            input_content = process_file(input_info)
            # A callable replacement keeps backslashes in the inlined text
            # from being interpreted as regex escapes.
            content = re.sub(search_pattern, lambda m: input_content, content)
        return content

    # Process the main file (looked up by name in the structure)
    structure = find_tex_inputs(project_dir, main_file, **kwargs)
    stitched_content = process_file(structure[main_file])

    if kwargs.pop('content_only', True) or output_file is None:
        return stitched_content  # return directly

    # Write the stitched content to the output file. The parent directory is
    # created on the joined path so it lands inside project_dir rather than
    # in the current working directory.
    output_file = os.path.join(project_dir, output_file)
    parent_dir = os.path.dirname(output_file)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    with open(output_file, 'w') as file:
        file.write(stitched_content)
    print(f"Stitched file created: {output_file}")
    return stitched_content
# Manage Bibtex Files -----------------------------------------------------
[docs]
def get_bibtex_dir(project_name, bibtex_dir='citation', **kwargs):
    """Build the path of a project's BibTeX directory.

    Args:
        project_name (str): Name of the Overleaf project.
        bibtex_dir (str, optional): Name of the directory containing BibTeX
            files. Defaults to 'citation'.
        **kwargs: Additional keyword arguments.
            overleaf_root (str): Path to the Overleaf root directory;
                resolved through get_overleaf_root() (None when omitted).

    Returns:
        str: The Overleaf root joined with the project name and the BibTeX
        directory name.
    """
    root = get_overleaf_root(kwargs.pop('overleaf_root', None))
    return os.path.join(root, project_name, bibtex_dir)
[docs]
def get_bibtex_files(project_path, bibtex_dir, other_dirs=[]):
    """Get a list of BibTeX files in the specified directories.

    Args:
        project_path (str): Path to the project root directory.
        bibtex_dir (str): Primary directory containing BibTeX files, either
            relative to ``project_path`` or absolute. None searches
            ``project_path`` itself.
        other_dirs (list, optional): Additional directories to search, with
            the same conventions as ``bibtex_dir``. Never modified.
            Defaults to an empty list.

    Returns:
        list: Paths of the ``*.bib`` files found, relative to
        ``project_path``; sorted within each searched directory.
    """
    directories = [bibtex_dir, *(other_dirs or [])]
    bibtex_files = []
    for directory in directories:
        if directory is None:
            search_dir = project_path
        else:
            # os.path.join leaves an absolute directory untouched, whereas
            # plain string concatenation would glue it onto project_path.
            search_dir = os.path.join(project_path, directory)
        # glob yields files in arbitrary order; sort for a stable result.
        bibtex_files += sorted(glob(os.path.join(search_dir, '*.bib')))
    # make all paths relative to project path
    return [os.path.relpath(file_path, project_path)
            for file_path in bibtex_files]
[docs]
def clean_bibtex_file(input_file_path, output_file_path=None):
    """Drop every line whose first non-blank character is ``%``.

    Args:
        input_file_path (str): Path to the input BibTeX file (read as UTF-8).
        output_file_path (str, optional): Where to write the cleaned file
            (as UTF-8). When falsy, nothing is written and the cleaned text
            is returned instead. Defaults to None.

    Returns:
        io.StringIO: Buffer holding the cleaned content when no output path
        is given, otherwise None.
    """
    with open(input_file_path, 'r', encoding='utf-8') as source:
        kept_lines = [line for line in source
                      if not line.strip().startswith('%')]

    if not output_file_path:
        return io.StringIO(''.join(kept_lines))

    with open(output_file_path, 'w', encoding='utf-8') as sink:
        sink.writelines(kept_lines)
def parse_bibtex_file(bibtex_content, backend='bibtexparser'):
    """Parse BibTeX content using the specified backend.

    Only the selected backend is imported, so the other library does not
    need to be installed.

    Args:
        bibtex_content (Union[str, io.StringIO]): BibTeX content as a string
            or StringIO object.
        backend (str, optional): Backend library to use for parsing.
            Options are 'bibtexparser' or 'pybtex'. Defaults to
            'bibtexparser'.

    Returns:
        object: Parsed BibTeX database object (type depends on the backend
        used).

    Raises:
        ValueError: If the specified backend is not supported.
    """
    # Validate first so an unknown backend fails with the documented error
    # instead of an ImportError from a library that is not even needed.
    if backend not in ('bibtexparser', 'pybtex'):
        raise ValueError("Unsupported backend specified.")

    if isinstance(bibtex_content, io.StringIO):
        bibtex_content.seek(0)  # read the buffer from the beginning
        bibtex_content = bibtex_content.read()

    if backend == 'pybtex':
        from pybtex.database import parse_string
        return parse_string(bibtex_content, 'bibtex')

    import bibtexparser  # assumed version 1.X
    with io.StringIO(bibtex_content) as bibtex_file:
        return bibtexparser.load(bibtex_file)
# Stitch Bibtex Files -----------------------------------------------------
[docs]
def stitch_bibtex_files(project_path, bibtex_files, output_file,
                        cleanup=False, dry_run=True, **kwargs):
    """Combine multiple BibTeX files into a single file, removing duplicates.

    Entries are keyed by their ``ID``; the first occurrence of each ID is
    kept and entries without an ID are dropped. The caller's
    ``bibtex_files`` list is never modified.

    Args:
        project_path (str): Path to the project root directory.
        bibtex_files (Union[str, list]): Either a directory containing
            BibTeX files (as given, or relative to ``project_path``) or a
            list of BibTeX file paths relative to ``project_path``
            (absolute paths are used unchanged).
        output_file (str): Path where the stitched file will be saved.
        cleanup (bool, optional): If True, delete or backup the original
            files. The output file itself is never cleaned up.
            Defaults to False.
        dry_run (bool, optional): If True, don't write the stitched file or
            perform cleanup. Defaults to True.
        **kwargs: Additional keyword arguments.
            prepend_project (bool): If True, prepend project_path to
                output_file. Defaults to True.
            backup_dir (str): Directory where original files will be backed
                up, if cleanup is True.
            verbose (bool): If True, print detailed information.
                Defaults to False.

    Returns:
        None
    """
    verbose = kwargs.get('verbose', False)
    prepend_project = kwargs.get('prepend_project', True)
    if prepend_project:
        output_file = os.path.join(project_path, output_file)

    files_to_process = []
    if isinstance(bibtex_files, str):
        # A directory: accept it as given or relative to the project.
        candidates = (bibtex_files, os.path.join(project_path, bibtex_files))
        directory = next((path for path in candidates
                          if os.path.isdir(path)), None)
        if directory is not None:
            # Searching from the directory itself yields bare file names,
            # which are joined back into openable paths.
            files_to_process = [
                os.path.join(directory, name)
                for name in get_bibtex_files(directory, None)]
    else:  # assume list of files
        for file_path in bibtex_files:
            # Project-relative by default; when prepending is disabled, a
            # path that already resolves is used as given.
            if prepend_project or not os.path.exists(file_path):
                file_path = os.path.join(project_path, file_path)
            files_to_process.append(file_path)

    if not dry_run:  # Ensure the output directory exists
        output_parent = os.path.dirname(output_file)
        if output_parent:
            os.makedirs(output_parent, exist_ok=True)

    # Read all .bib files and accumulate unique entries
    stitched_entries = {}
    for file_path in files_to_process:
        with open(file_path, 'r') as bibtex_file:
            bib_database = bibtexparser.load(bibtex_file)
        if verbose:
            print(f"{len(bib_database.entries)} entries fetched",
                  f"from {os.path.basename(file_path)}")
        for entry in bib_database.entries:
            entry_id = entry.get('ID', None)
            if entry_id and entry_id not in stitched_entries:
                stitched_entries[entry_id] = entry

    if verbose or dry_run:
        # report number of unique entries
        print(f"{len(stitched_entries)} unique entries across",
              f"{len(files_to_process)} bibtex files")

    if not dry_run:  # Write unique entries to output_file
        with open(output_file, 'w') as write_file:
            writer = bibtexparser.bwriter.BibTexWriter()
            db = bibtexparser.bibdatabase.BibDatabase()
            db.entries = list(stitched_entries.values())
            write_file.write(writer.write(db))
        print(f"Bibtex entries stitched to: {output_file}")
    else:  # Report the output file name without writing
        print(f"Dry-Run: Entries stitched to {output_file}")

    if cleanup:  # delete or move stitched files to backup
        # The output may also be one of the inputs; it must survive cleanup.
        output_abspath = os.path.abspath(output_file)
        stale_files = [file_path for file_path in files_to_process
                       if os.path.abspath(file_path) != output_abspath]
        backup_dir = kwargs.get('backup_dir', None)
        if backup_dir is not None:
            timestamp = datetime.now().strftime("%Y-%m-%d")
            output_dir = f'{Path(output_file).parent.parent}/{backup_dir}'
            backup_dir = f"{output_dir}/backup/{timestamp}"
            if not dry_run:  # build the backup directory
                os.makedirs(backup_dir, exist_ok=True)
                for file_path in stale_files:
                    dst = os.path.join(backup_dir, os.path.basename(file_path))
                    shutil.move(file_path, dst)
                    if verbose:
                        print(f"Moving {file_path} to {dst}")
            else:  # report the move without actually moving
                for file_path in stale_files:
                    print(f"Would move {file_path} to {backup_dir}")
        else:  # delete the stitched files
            for file_path in stale_files:
                action_report = 'Would delete'
                if not dry_run:  # delete file
                    os.remove(file_path)
                    action_report = 'Deleting'
                if verbose:
                    print(f"{action_report} {file_path}")