From 1feae235669fe1e52278406afea53428a035dfe6 Mon Sep 17 00:00:00 2001 From: Grant Sanderson Date: Thu, 29 Dec 2022 10:37:46 -0800 Subject: [PATCH] Improve num_tex_symbols --- manimlib/utils/tex.py | 68 ++--- manimlib/utils/tex_to_symbol_count.py | 359 +++++++++++++------------- 2 files changed, 205 insertions(+), 222 deletions(-) diff --git a/manimlib/utils/tex.py b/manimlib/utils/tex.py index e3ea53bb..1a70e429 100644 --- a/manimlib/utils/tex.py +++ b/manimlib/utils/tex.py @@ -1,57 +1,41 @@ from __future__ import annotations import re -from functools import lru_cache -from typing import TYPE_CHECKING -if TYPE_CHECKING: - from typing import List, Tuple - - -@lru_cache(maxsize=1) -def get_pattern_symbol_count_pairs() -> List[Tuple[str, int]]: - from manimlib.utils.tex_to_symbol_count import TEX_TO_SYMBOL_COUNT - - # Gather all keys of previous map, grouped by common value - count_to_tex_list = dict() - for command, num in TEX_TO_SYMBOL_COUNT.items(): - if num not in count_to_tex_list: - count_to_tex_list[num] = [] - count_to_tex_list[num].append(command) - - # Create a list associating each count with a regular expression - # that will find any tex commands matching that list - pattern_symbol_count_pairs = list() - - # Account for patterns like \begin{align} and \phantom{thing} - # which, together with the bracketed content account for zero paths. - # Deliberately put this first in the list - tex_list = ["begin", "end", "phantom"] - pattern_symbol_count_pairs.append( - ("|".join(r"\\" + s + r"\{[^\\}]+\}" for s in tex_list), 0) - ) - - for count, tex_list in count_to_tex_list.items(): - pattern = "|".join(r"\\" + s for s in tex_list) - pattern_symbol_count_pairs.append((pattern, count)) - - # Assume all other expressions of the form \thing are drawn with one path - # Deliberately put this last in the list - pattern_symbol_count_pairs.append((r"\\[a-zA-Z]+", 1)) - - return pattern_symbol_count_pairs +from manimlib.utils.tex_to_symbol_count import TEX_TO_SYMBOL_COUNT def num_tex_symbols(tex: str) -> int: """ This function attempts to estimate the number of symbols that a given string of tex would produce. + + Warning, it may not behave perfectly """ + # First, remove patterns like \begin{align}, \phantom{thing}, + # \begin{array}{cc}, etc. + pattern = "|".join( + r"(\\" + s + ")" + r"(\{\w+\})?(\{\w+\})?(\[\w+\])?" + for s in ["begin", "end", "phantom"] + ) + for tup in re.findall(pattern, tex): + tex = tex.replace("".join(tup), " ") + + # Progressively count the symbols associated with certain tex commands, + # and remove those commands from the string, adding the number of symbols + # that command creates total = 0 - for pattern, count in get_pattern_symbol_count_pairs(): - total += count * len(re.findall(pattern, tex)) - tex = re.sub(pattern, " ", tex) # Remove that pattern + + # Start with the special case \sqrt[number] + for substr in re.findall(r"\\sqrt\[[0-9]+\]", tex): + total += len(substr) - 5 # e.g. \sqrt[3] is 3 symbols + tex = tex.replace(substr, " ") + + general_command = r"\\[a-zA-Z!,-/:;<>]+" + for substr in re.findall(general_command, tex): + total += TEX_TO_SYMBOL_COUNT.get(substr, 1) + tex = tex.replace(substr, " ") # Count remaining characters - total += sum(map(lambda c: c not in "^{} \n\t_$", tex)) + total += sum(map(lambda c: c not in "^{} \n\t_$\\&", tex)) return total diff --git a/manimlib/utils/tex_to_symbol_count.py b/manimlib/utils/tex_to_symbol_count.py index e858b572..c314b5dd 100644 --- a/manimlib/utils/tex_to_symbol_count.py +++ b/manimlib/utils/tex_to_symbol_count.py @@ -1,182 +1,181 @@ TEX_TO_SYMBOL_COUNT = { - "!": 0, - ",": 0, - ",": 0, - "-": 0, - "-": 0, - "/": 0, - ":": 0, - ";": 0, - ";": 0, - ">": 0, - "aa": 0, - "AA": 0, - "ae": 0, - "AE": 0, - "arccos": 6, - "arcsin": 6, - "arctan": 6, - "arg": 3, - "author": 0, - "bf": 0, - "bibliography": 0, - "bibliographystyle": 0, - "big": 0, - "Big": 0, - "bigodot": 4, - "bigoplus": 5, - "bigskip": 0, - "bmod": 3, - "boldmath": 0, - "bottomfraction": 2, - "bowtie": 2, - "cal": 0, - "cdots": 3, - "centering": 0, - "cite": 2, - "cong": 2, - "contentsline": 0, - "cos": 3, - "cosh": 4, - "cot": 3, - "coth": 4, - "csc": 3, - "date": 0, - "dblfloatpagefraction": 2, - "dbltopfraction": 2, - "ddots": 3, - "deg": 3, - "det": 3, - "dim": 3, - "displaystyle": 0, - "div": 2, - "doteq": 2, - "dotfill": 0, - "emph": 0, - "exp": 3, - "fbox": 4, - "floatpagefraction": 2, - "flushbottom": 0, - "footnotesize": 0, - "footnotetext": 0, - "frame": 2, - "framebox": 4, - "fussy": 0, - "gcd": 3, - "ghost": 0, - "glossary": 0, - "hfill": 0, - "hom": 3, - "hookleftarrow": 2, - "hookrightarrow": 2, - "hrulefill": 0, - "huge": 0, - "Huge": 0, - "hyphenation": 0, - "iff": 2, - "Im": 2, - "index": 0, - "inf": 3, - "it": 0, - "ker": 3, - "l": 0, - "L": 0, - "label": 0, - "large": 0, - "Large": 0, - "LARGE": 0, - "ldots": 3, - "lefteqn": 0, - "lg": 2, - "lim": 3, - "liminf": 6, - "limsup": 6, - "linebreak": 0, - "ln": 2, - "log": 3, - "longleftarrow": 2, - "Longleftarrow": 2, - "longleftrightarrow": 2, - "Longleftrightarrow": 2, - "longmapsto": 3, - "longrightarrow": 2, - "Longrightarrow": 2, - "makebox": 0, - "mapsto": 2, - "markright": 0, - "max": 3, - "mbox": 0, - "medskip": 0, - "min": 3, - "mit": 0, - "models": 2, - "ne": 2, - "neq": 2, - "newline": 0, - "noindent": 0, - "nolinebreak": 0, - "nonumber": 0, - "nopagebreak": 0, - "normalmarginpar": 0, - "normalsize": 0, - "notin": 2, - "o": 0, - "O": 0, - "obeycr": 0, - "oe": 0, - "OE": 0, - "overbrace": 4, - "pagebreak": 0, - "pagenumbering": 0, - "pageref": 2, - "pmod": 5, - "Pr": 2, - "protect": 0, - "qquad": 0, - "quad": 0, - "raggedbottom": 0, - "raggedleft": 0, - "raggedright": 0, - "Re": 2, - "ref": 2, - "restorecr": 0, - "reversemarginpar": 0, - "rm": 0, - "sc": 0, - "scriptscriptstyle": 0, - "scriptsize": 0, - "scriptstyle": 0, - "sec": 3, - "sf": 0, - "shortstack": 0, - "sin": 3, - "sinh": 4, - "sl": 0, - "sloppy": 0, - "small": 0, - "Small": 0, - "smallskip": 0, - "sqrt": 2, - "ss": 0, - "sup": 3, - "tan": 3, - "tanh": 4, - "textbf": 0, - "textfraction": 2, - "textstyle": 0, - "thicklines": 0, - "thinlines": 0, - "thinspace": 0, - "tiny": 0, - "title": 0, - "today": 15, - "topfraction": 2, - "tt": 0, - "typeout": 0, - "unboldmath": 0, - "underbrace": 6, - "underline": 0, - "value": 0, - "vdots": 3, - "vline": 0 + R"\!": 0, + R"\,": 0, + R"\-": 0, + R"\/": 0, + R"\:": 0, + R"\;": 0, + R"\>": 0, + R"\aa": 0, + R"\AA": 0, + R"\ae": 0, + R"\AE": 0, + R"\arccos": 6, + R"\arcsin": 6, + R"\arctan": 6, + R"\arg": 3, + R"\author": 0, + R"\bf": 0, + R"\bibliography": 0, + R"\bibliographystyle": 0, + R"\big": 0, + R"\Big": 0, + R"\bigodot": 4, + R"\bigoplus": 5, + R"\bigskip": 0, + R"\bmod": 3, + R"\boldmath": 0, + R"\bottomfraction": 2, + R"\bowtie": 2, + R"\cal": 0, + R"\cdots": 3, + R"\centering": 0, + R"\cite": 2, + R"\cong": 2, + R"\contentsline": 0, + R"\cos": 3, + R"\cosh": 4, + R"\cot": 3, + R"\coth": 4, + R"\csc": 3, + R"\date": 0, + R"\dblfloatpagefraction": 2, + R"\dbltopfraction": 2, + R"\ddots": 3, + R"\deg": 3, + R"\det": 3, + R"\dim": 3, + R"\displaystyle": 0, + R"\div": 2, + R"\doteq": 2, + R"\dotfill": 0, + R"\emph": 0, + R"\exp": 3, + R"\fbox": 4, + R"\floatpagefraction": 2, + R"\flushbottom": 0, + R"\footnotesize": 0, + R"\footnotetext": 0, + R"\frame": 2, + R"\framebox": 4, + R"\fussy": 0, + R"\gcd": 3, + R"\ghost": 0, + R"\glossary": 0, + R"\hfill": 0, + R"\hom": 3, + R"\hookleftarrow": 2, + R"\hookrightarrow": 2, + R"\hrulefill": 0, + R"\huge": 0, + R"\Huge": 0, + R"\hyphenation": 0, + R"\iff": 2, + R"\Im": 2, + R"\index": 0, + R"\inf": 3, + R"\it": 0, + R"\ker": 3, + R"\l": 0, + R"\L": 0, + R"\label": 0, + R"\large": 0, + R"\Large": 0, + R"\LARGE": 0, + R"\ldots": 3, + R"\lefteqn": 0, + R"\left": 0, + R"\lg": 2, + R"\lim": 3, + R"\liminf": 6, + R"\limsup": 6, + R"\linebreak": 0, + R"\ln": 2, + R"\log": 3, + R"\longleftarrow": 2, + R"\Longleftarrow": 2, + R"\longleftrightarrow": 2, + R"\Longleftrightarrow": 2, + R"\longmapsto": 3, + R"\longrightarrow": 2, + R"\Longrightarrow": 2, + R"\makebox": 0, + R"\mapsto": 2, + R"\markright": 0, + R"\max": 3, + R"\mbox": 0, + R"\medskip": 0, + R"\min": 3, + R"\mit": 0, + R"\models": 2, + R"\ne": 2, + R"\neq": 2, + R"\newline": 0, + R"\noindent": 0, + R"\nolinebreak": 0, + R"\nonumber": 0, + R"\nopagebreak": 0, + R"\normalmarginpar": 0, + R"\normalsize": 0, + R"\notin": 2, + R"\o": 0, + R"\O": 0, + R"\obeycr": 0, + R"\oe": 0, + R"\OE": 0, + R"\overbrace": 4, + R"\pagebreak": 0, + R"\pagenumbering": 0, + R"\pageref": 2, + R"\pmod": 5, + R"\Pr": 2, + R"\protect": 0, + R"\qquad": 0, + R"\quad": 0, + R"\raggedbottom": 0, + R"\raggedleft": 0, + R"\raggedright": 0, + R"\Re": 2, + R"\ref": 2, + R"\restorecr": 0, + R"\reversemarginpar": 0, + R"\right": 0, + R"\rm": 0, + R"\sc": 0, + R"\scriptscriptstyle": 0, + R"\scriptsize": 0, + R"\scriptstyle": 0, + R"\sec": 3, + R"\sf": 0, + R"\shortstack": 0, + R"\sin": 3, + R"\sinh": 4, + R"\sl": 0, + R"\sloppy": 0, + R"\small": 0, + R"\Small": 0, + R"\smallskip": 0, + R"\sqrt": 2, + R"\ss": 0, + R"\sup": 3, + R"\tan": 3, + R"\tanh": 4, + R"\textbf": 0, + R"\textfraction": 2, + R"\textstyle": 0, + R"\thicklines": 0, + R"\thinlines": 0, + R"\thinspace": 0, + R"\tiny": 0, + R"\title": 0, + R"\today": 15, + R"\topfraction": 2, + R"\tt": 0, + R"\typeout": 0, + R"\unboldmath": 0, + R"\underbrace": 6, + R"\underline": 0, + R"\value": 0, + R"\vdots": 3, + R"\vline": 0 } \ No newline at end of file