Improve num_tex_symbols

This commit is contained in:
Grant Sanderson 2022-12-29 10:37:46 -08:00
parent 53f19b6620
commit 1feae23566
2 changed files with 205 additions and 222 deletions

View file

@ -1,57 +1,41 @@
from __future__ import annotations
import re
from functools import lru_cache
from typing import TYPE_CHECKING
if TYPE_CHECKING:
from typing import List, Tuple
@lru_cache(maxsize=1)
def get_pattern_symbol_count_pairs() -> List[Tuple[str, int]]:
    """
    Build the ordered list of (regex pattern, symbol count) pairs used to
    estimate how many symbols a string of tex produces.

    The order of the returned list matters to callers that apply the
    patterns one after another:

    - first, a zero-count pattern for ``\\begin{...}``, ``\\end{...}`` and
      ``\\phantom{...}`` together with their bracketed argument;
    - then one pattern per distinct count found in TEX_TO_SYMBOL_COUNT;
    - last, a catch-all that counts any other ``\\word`` as one symbol.

    No pattern contains a capturing group, so ``re.findall`` yields one
    string per match. The result is cached, so every caller shares the
    same list object.
    """
    from manimlib.utils.tex_to_symbol_count import TEX_TO_SYMBOL_COUNT

    def command_pattern(name: str) -> str:
        # Escape the name so punctuation commands are matched literally.
        escaped = r"\\" + re.escape(name)
        if name.isalpha():
            # A control word ends at the first non-letter. Without this
            # guard, \l would match the start of \lambda and \big the
            # start of \bigcup, leaving stray letters to be counted.
            return escaped + r"(?![a-zA-Z])"
        return escaped

    # Gather the command names of the table, grouped by common count.
    count_to_names: dict = {}
    for command, num in TEX_TO_SYMBOL_COUNT.items():
        # Accept keys written with or without the leading backslash.
        name = command[1:] if command.startswith("\\") else command
        names = count_to_names.setdefault(num, [])
        if name not in names:
            names.append(name)

    # Account for patterns like \begin{align} and \phantom{thing} which,
    # together with the bracketed content, account for zero paths.
    # Deliberately put this first in the list.
    zero_width_commands = ["begin", "end", "phantom"]
    pattern_symbol_count_pairs = [
        ("|".join(r"\\" + s + r"\{[^\\}]+\}" for s in zero_width_commands), 0)
    ]

    # One alternation per count, matching any command with that count.
    for count, names in count_to_names.items():
        pattern = "|".join(command_pattern(name) for name in names)
        pattern_symbol_count_pairs.append((pattern, count))

    # Assume all other expressions of the form \thing are drawn with one
    # path. Deliberately put this last in the list.
    pattern_symbol_count_pairs.append((r"\\[a-zA-Z]+", 1))
    return pattern_symbol_count_pairs
from manimlib.utils.tex_to_symbol_count import TEX_TO_SYMBOL_COUNT
def num_tex_symbols(tex: str) -> int:
    """
    This function attempts to estimate the number of symbols that
    a given string of tex would produce.

    The estimate is built in four passes:

    1. ``\\begin``, ``\\end`` and ``\\phantom``, together with their
       bracketed arguments, are removed and contribute nothing.
    2. Each ``\\sqrt[number]`` contributes its length minus five.
    3. Every remaining command contributes its entry in
       TEX_TO_SYMBOL_COUNT (keyed with the leading backslash), or 1 if
       it has no entry.
    4. Every remaining character counts as one symbol, except
       whitespace and the characters ``^ { } _ $ \\ &``.

    Warning, it may not behave perfectly
    """
    # First, remove patterns like \begin{align}, \phantom{thing},
    # \begin{array}{cc}, etc. A single regex pass is used so that removing
    # a short match such as \begin{a} cannot damage a longer one such as
    # \begin{a}{b} elsewhere in the string.
    environment_pattern = (
        r"\\(?:begin|end|phantom)(?:\{\w+\})?(?:\{\w+\})?(?:\[\w+\])?"
    )
    tex = re.sub(environment_pattern, " ", tex)

    # Progressively count the symbols associated with certain tex commands,
    # and remove those commands from the string, adding the number of
    # symbols that command creates.
    total = 0

    # Start with the special case \sqrt[number]
    sqrt_pattern = r"\\sqrt\[[0-9]+\]"
    # e.g. \sqrt[3] is 3 symbols
    total += sum(len(substr) - 5 for substr in re.findall(sqrt_pattern, tex))
    tex = re.sub(sqrt_pattern, " ", tex)

    # A command is a backslash followed by either a run of letters or by
    # exactly one punctuation character, so \,dx is read as \, then d, x.
    # Removal is again a single regex pass, so removing \sin leaves \sinh
    # untouched.
    command_pattern = r"\\(?:[a-zA-Z]+|[!,-/:;<>])"
    total += sum(
        TEX_TO_SYMBOL_COUNT.get(command, 1)
        for command in re.findall(command_pattern, tex)
    )
    tex = re.sub(command_pattern, " ", tex)

    # Count remaining characters
    total += sum(char not in "^{} \n\t_$\\&" for char in tex)
    return total

View file

@ -1,182 +1,181 @@
# Maps a tex command, written with its leading backslash, to the number of
# symbols it is estimated to produce. A value of 0 marks commands that draw
# nothing themselves (spacing, font and size switches, delimiter sizing).
# num_tex_symbols looks commands up here and falls back to 1 for any
# command that is not listed.
TEX_TO_SYMBOL_COUNT = {
    R"\!": 0,
    R"\,": 0,
    R"\-": 0,
    R"\/": 0,
    R"\:": 0,
    R"\;": 0,
    R"\>": 0,
    R"\aa": 0,
    R"\AA": 0,
    R"\ae": 0,
    R"\AE": 0,
    R"\arccos": 6,
    R"\arcsin": 6,
    R"\arctan": 6,
    R"\arg": 3,
    R"\author": 0,
    R"\bf": 0,
    R"\bibliography": 0,
    R"\bibliographystyle": 0,
    R"\big": 0,
    R"\Big": 0,
    R"\bigodot": 4,
    R"\bigoplus": 5,
    R"\bigskip": 0,
    R"\bmod": 3,
    R"\boldmath": 0,
    R"\bottomfraction": 2,
    R"\bowtie": 2,
    R"\cal": 0,
    R"\cdots": 3,
    R"\centering": 0,
    R"\cite": 2,
    R"\cong": 2,
    R"\contentsline": 0,
    R"\cos": 3,
    R"\cosh": 4,
    R"\cot": 3,
    R"\coth": 4,
    R"\csc": 3,
    R"\date": 0,
    R"\dblfloatpagefraction": 2,
    R"\dbltopfraction": 2,
    R"\ddots": 3,
    R"\deg": 3,
    R"\det": 3,
    R"\dim": 3,
    R"\displaystyle": 0,
    R"\div": 2,
    R"\doteq": 2,
    R"\dotfill": 0,
    R"\emph": 0,
    R"\exp": 3,
    R"\fbox": 4,
    R"\floatpagefraction": 2,
    R"\flushbottom": 0,
    R"\footnotesize": 0,
    R"\footnotetext": 0,
    R"\frame": 2,
    R"\framebox": 4,
    R"\fussy": 0,
    R"\gcd": 3,
    R"\ghost": 0,
    R"\glossary": 0,
    R"\hfill": 0,
    R"\hom": 3,
    R"\hookleftarrow": 2,
    R"\hookrightarrow": 2,
    R"\hrulefill": 0,
    R"\huge": 0,
    R"\Huge": 0,
    R"\hyphenation": 0,
    R"\iff": 2,
    R"\Im": 2,
    R"\index": 0,
    R"\inf": 3,
    R"\it": 0,
    R"\ker": 3,
    R"\l": 0,
    R"\L": 0,
    R"\label": 0,
    R"\large": 0,
    R"\Large": 0,
    R"\LARGE": 0,
    R"\ldots": 3,
    R"\lefteqn": 0,
    R"\left": 0,
    R"\lg": 2,
    R"\lim": 3,
    R"\liminf": 6,
    R"\limsup": 6,
    R"\linebreak": 0,
    R"\ln": 2,
    R"\log": 3,
    R"\longleftarrow": 2,
    R"\Longleftarrow": 2,
    R"\longleftrightarrow": 2,
    R"\Longleftrightarrow": 2,
    R"\longmapsto": 3,
    R"\longrightarrow": 2,
    R"\Longrightarrow": 2,
    R"\makebox": 0,
    R"\mapsto": 2,
    R"\markright": 0,
    R"\max": 3,
    R"\mbox": 0,
    R"\medskip": 0,
    R"\min": 3,
    R"\mit": 0,
    R"\models": 2,
    R"\ne": 2,
    R"\neq": 2,
    R"\newline": 0,
    R"\noindent": 0,
    R"\nolinebreak": 0,
    R"\nonumber": 0,
    R"\nopagebreak": 0,
    R"\normalmarginpar": 0,
    R"\normalsize": 0,
    R"\notin": 2,
    R"\o": 0,
    R"\O": 0,
    R"\obeycr": 0,
    R"\oe": 0,
    R"\OE": 0,
    R"\overbrace": 4,
    R"\pagebreak": 0,
    R"\pagenumbering": 0,
    R"\pageref": 2,
    R"\pmod": 5,
    R"\Pr": 2,
    R"\protect": 0,
    R"\qquad": 0,
    R"\quad": 0,
    R"\raggedbottom": 0,
    R"\raggedleft": 0,
    R"\raggedright": 0,
    R"\Re": 2,
    R"\ref": 2,
    R"\restorecr": 0,
    R"\reversemarginpar": 0,
    R"\right": 0,
    R"\rm": 0,
    R"\sc": 0,
    R"\scriptscriptstyle": 0,
    R"\scriptsize": 0,
    R"\scriptstyle": 0,
    R"\sec": 3,
    R"\sf": 0,
    R"\shortstack": 0,
    R"\sin": 3,
    R"\sinh": 4,
    R"\sl": 0,
    R"\sloppy": 0,
    R"\small": 0,
    R"\Small": 0,
    R"\smallskip": 0,
    R"\sqrt": 2,
    R"\ss": 0,
    R"\sup": 3,
    R"\tan": 3,
    R"\tanh": 4,
    R"\textbf": 0,
    R"\textfraction": 2,
    R"\textstyle": 0,
    R"\thicklines": 0,
    R"\thinlines": 0,
    R"\thinspace": 0,
    R"\tiny": 0,
    R"\title": 0,
    R"\today": 15,
    R"\topfraction": 2,
    R"\tt": 0,
    R"\typeout": 0,
    R"\unboldmath": 0,
    R"\underbrace": 6,
    R"\underline": 0,
    R"\value": 0,
    R"\vdots": 3,
    R"\vline": 0,
}