2025-04-17 14:45:06 -04:00
|
|
|
#!/usr/bin/env python3
|
|
|
|
# SPDX-License-Identifier: GPL-2.0
|
|
|
|
#
|
|
|
|
# Leverage Python's unicodedata module to generate ucs_width_table.h
|
|
|
|
|
|
|
|
import unicodedata
|
|
|
|
import sys
|
2025-04-17 14:45:16 -04:00
|
|
|
import argparse
|
2025-04-17 14:45:06 -04:00
|
|
|
|
|
|
|
# This script's file name (embedded in the generated header's
# "Auto-generated by" line)
from pathlib import Path
this_file = Path(__file__).name

# Default output file name
DEFAULT_OUT_FILE = "ucs_width_table.h"
|
2025-04-17 14:45:06 -04:00
|
|
|
|
|
|
|
# --- Global Constants for Width Assignments ---

# Known zero-width characters
# Checked by membership (`cp in KNOWN_ZERO_WIDTH`) when classifying code
# points, so any code point listed here is emitted with width 0.
KNOWN_ZERO_WIDTH = (
    0x200B,  # ZERO WIDTH SPACE
    0x200C,  # ZERO WIDTH NON-JOINER
    0x200D,  # ZERO WIDTH JOINER
    0x2060,  # WORD JOINER
    0xFEFF,  # ZERO WIDTH NO-BREAK SPACE (BOM)
)
|
|
|
|
|
|
|
|
# Zero-width emoji modifiers and components
# NOTE: Some of these characters would normally be single-width according to
# East Asian Width properties, but we deliberately override them to be
# zero-width because they function as modifiers in emoji sequences.
# Each entry is an inclusive (first, last) code point pair.
EMOJI_ZERO_WIDTH = [
    # Skin tone modifiers
    (0x1F3FB, 0x1F3FF),  # Emoji modifiers (skin tones)

    # Variation selectors (note: VS16 is treated specially in vt.c)
    (0xFE00, 0xFE0F),  # Variation Selectors 1-16

    # Gender and hair style modifiers
    # These would be single-width by Unicode properties, but are zero-width
    # when part of emoji
    (0x2640, 0x2640),  # Female sign
    (0x2642, 0x2642),  # Male sign
    (0x26A7, 0x26A7),  # Transgender symbol
    (0x1F9B0, 0x1F9B3),  # Hair components (red, curly, bald, white)

    # Tag characters
    (0xE0020, 0xE007E),  # Tags
]
|
|
|
|
|
|
|
|
# Regional indicators (flag components)
# Inclusive (first, last) code point pair.
REGIONAL_INDICATORS = (0x1F1E6, 0x1F1FF)  # Regional indicator symbols A-Z
|
|
|
|
|
|
|
|
# Double-width emoji ranges
#
# Many emoji characters are classified as single-width according to Unicode
# Standard Annex #11 East Asian Width property (N or Neutral), but we
# deliberately override them to be double-width. References:
# 1. Unicode Technical Standard #51: Unicode Emoji
#    (https://www.unicode.org/reports/tr51/)
# 2. Principle of "emoji presentation" in the W3C CSS Text specification
#    (https://drafts.csswg.org/css-text-3/#character-properties)
# 3. Terminal emulator implementations (iTerm2, Windows Terminal, etc.) which
#    universally render emoji as double-width characters regardless of their
#    Unicode EAW property
# 4. W3C Work Item: Requirements for Japanese Text Layout - Section 3.8.1
#    Emoji width (https://www.w3.org/TR/jlreq/)
#
# Each entry is an inclusive (first, last) code point pair.
EMOJI_RANGES = [
    (0x1F000, 0x1F02F),  # Mahjong Tiles (EAW: N, but displayed as double-width)
    (0x1F0A0, 0x1F0FF),  # Playing Cards (EAW: N, but displayed as double-width)
    (0x1F300, 0x1F5FF),  # Miscellaneous Symbols and Pictographs
    (0x1F600, 0x1F64F),  # Emoticons
    (0x1F680, 0x1F6FF),  # Transport and Map Symbols
    (0x1F700, 0x1F77F),  # Alchemical Symbols
    (0x1F780, 0x1F7FF),  # Geometric Shapes Extended
    (0x1F800, 0x1F8FF),  # Supplemental Arrows-C
    (0x1F900, 0x1F9FF),  # Supplemental Symbols and Pictographs
    (0x1FA00, 0x1FA6F),  # Chess Symbols
    (0x1FA70, 0x1FAFF),  # Symbols and Pictographs Extended-A
]
|
|
|
|
|
|
|
|
def create_width_tables():
    """
    Creates Unicode character width tables and returns the data structures.

    Widths are assigned in this order; an earlier assignment is only
    changed where stated:

    1. Everything in EMOJI_ZERO_WIDTH is forced to width 0.
    2. Everything in REGIONAL_INDICATORS is forced to width 1.
    3. Every other code point in U+0000..U+10FFFF gets width 0 if it is a
       combining mark (category M*), a format character (Cf) or listed in
       KNOWN_ZERO_WIDTH; width 2 if its East Asian Width is F or W; and
       width 1 otherwise.
    4. Every code point inside EMOJI_RANGES whose width is not 0 is then
       set to width 2.

    Returns:
        tuple: (zero_width_ranges, double_width_ranges), each a list of
        inclusive (start, end) code point tuples in ascending order.
    """

    # Width data mapping
    width_map = {}  # Maps code points to width (0, 1, 2)

    # Mark emoji modifiers as zero-width. Done first so the property-based
    # pass below skips them instead of giving them their default width.
    for start, end in EMOJI_ZERO_WIDTH:
        for cp in range(start, end + 1):
            width_map[cp] = 0

    # Mark all regional indicators as single-width as they are usually paired
    # providing a combined width of 2 when displayed together.
    start, end = REGIONAL_INDICATORS
    for cp in range(start, end + 1):
        width_map[cp] = 1

    # Process every code point in the full Unicode range 0x0 to 0x10FFFF
    # (Basic Multilingual Plane + Supplementary Planes), assigned or not.
    # chr() accepts every value in this range, so no error handling is
    # needed here.
    for cp in range(0x110000):
        # Skip if already processed
        if cp in width_map:
            continue

        char = chr(cp)
        category = unicodedata.category(char)

        # Zero-width classes:
        # - Combining marks (category M*).
        # - Format characters (category Cf): since we have no support for
        #   bidirectional text, all of them can be treated with width 0
        #   for simplicity, as they don't need to occupy visual space in a
        #   non-bidirectional text environment.
        # - Known zero-width characters.
        if (category.startswith('M') or category == 'Cf'
                or cp in KNOWN_ZERO_WIDTH):
            width_map[cp] = 0
            continue

        # Use East Asian Width property: Fullwidth (F) and Wide (W) are
        # double-width; everything else (Na, H, N, A or anything unknown)
        # defaults to single-width.
        eaw = unicodedata.east_asian_width(char)
        width_map[cp] = 2 if eaw in ('F', 'W') else 1

    # Process Emoji - generally double-width
    for start, end in EMOJI_RANGES:
        for cp in range(start, end + 1):
            # Don't override zero-width; anything else (including a code
            # point missing from the map) becomes double-width.
            if width_map.get(cp) != 0:
                width_map[cp] = 2

    # Optimize to create range tables
    def ranges_optimize(width_data, target_width):
        """Collapse all code points of target_width into sorted inclusive
        (start, end) runs of consecutive code points."""
        points = sorted(cp for cp, width in width_data.items()
                        if width == target_width)
        if not points:
            return []

        # Group consecutive code points into ranges
        ranges = []
        start = prev = points[0]
        for cp in points[1:]:
            if cp > prev + 1:
                # Gap found: close the current run and start a new one
                ranges.append((start, prev))
                start = cp
            prev = cp

        # Add the last range
        ranges.append((start, prev))
        return ranges

    # Extract ranges for each width
    zero_width_ranges = ranges_optimize(width_map, 0)
    double_width_ranges = ranges_optimize(width_map, 2)

    return zero_width_ranges, double_width_ranges
|
|
|
|
|
2025-04-17 14:45:16 -04:00
|
|
|
def write_tables(zero_width_ranges, double_width_ranges, out_file=DEFAULT_OUT_FILE):
    """
    Write the generated tables to C header file.

    Each input list is split at the BMP boundary and emitted as two C
    arrays: a `struct ucs_interval16` table for U+0000..U+FFFF and a
    `struct ucs_interval32` table for U+10000 and above. Every array entry
    carries a trailing C comment naming the code point(s) it covers.

    Args:
        zero_width_ranges: List of (start, end) ranges for zero-width characters
        double_width_ranges: List of (start, end) ranges for double-width characters
        out_file: Output file name (default: DEFAULT_OUT_FILE)
    """

    # Split ranges into BMP and non-BMP
    zero_width_bmp, zero_width_non_bmp = _split_ranges_by_size(zero_width_ranges)
    double_width_bmp, double_width_non_bmp = _split_ranges_by_size(double_width_ranges)

    # (description, C struct type, C array name, ranges, hex digits per value)
    tables = (
        ("Zero-width character ranges "
         "(BMP - Basic Multilingual Plane, U+0000 to U+FFFF)",
         "ucs_interval16", "ucs_zero_width_bmp_ranges", zero_width_bmp, 4),
        ("Zero-width character ranges (non-BMP, U+10000 and above)",
         "ucs_interval32", "ucs_zero_width_non_bmp_ranges", zero_width_non_bmp, 5),
        ("Double-width character ranges "
         "(BMP - Basic Multilingual Plane, U+0000 to U+FFFF)",
         "ucs_interval16", "ucs_double_width_bmp_ranges", double_width_bmp, 4),
        ("Double-width character ranges (non-BMP, U+10000 and above)",
         "ucs_interval32", "ucs_double_width_non_bmp_ranges", double_width_non_bmp, 5),
    )

    # Generate C tables
    with open(out_file, 'w', encoding='utf-8') as f:
        f.write(f"""\
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * {out_file} - Unicode character width
 *
 * Auto-generated by {this_file}
 *
 * Unicode Version: {unicodedata.unidata_version}
 */
""")

        for description, ctype, name, ranges, digits in tables:
            # A blank line separates each table from whatever precedes it
            f.write(f"\n/* {description} */\n"
                    f"static const struct {ctype} {name}[] = {{\n")
            for start, end in ranges:
                comment = _code_point_comment(start, end)
                f.write(f"\t{{ 0x{start:0{digits}X}, 0x{end:0{digits}X} }}, {comment}\n")
            f.write("};\n")


def _split_ranges_by_size(ranges):
    """Split (start, end) ranges into BMP (16-bit) and non-BMP lists.

    A range that straddles U+FFFF is cut in two at that boundary.
    """
    bmp_ranges = []
    non_bmp_ranges = []

    for start, end in ranges:
        if end <= 0xFFFF:
            bmp_ranges.append((start, end))
        elif start > 0xFFFF:
            non_bmp_ranges.append((start, end))
        else:
            # Split the range at 0xFFFF
            bmp_ranges.append((start, 0xFFFF))
            non_bmp_ranges.append((0x10000, end))

    return bmp_ranges, non_bmp_ranges


def _code_point_comment(start, end):
    """Return a C comment describing the inclusive range start..end.

    Uses the Unicode character names when both endpoints have one, and
    falls back to U+XXXX notation for the whole comment otherwise.
    """
    try:
        start_char_desc = unicodedata.name(chr(start))
        if start == end:
            return f"/* {start_char_desc} */"
        end_char_desc = unicodedata.name(chr(end))
        return f"/* {start_char_desc} - {end_char_desc} */"
    except (ValueError, OverflowError):
        # unicodedata.name() raises ValueError for unnamed code points;
        # chr() raises ValueError/OverflowError for out-of-range values.
        if start == end:
            return f"/* U+{start:04X} */"
        return f"/* U+{start:04X} - U+{end:04X} */"
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # Parse command line arguments
    parser = argparse.ArgumentParser(description="Generate Unicode width tables")
    parser.add_argument("-o", "--output", dest="output_file", default=DEFAULT_OUT_FILE,
                        help=f"Output file name (default: {DEFAULT_OUT_FILE})")
    args = parser.parse_args()

    # Build the tables and write them to the header file
    zero_width_ranges, double_width_ranges = create_width_tables()
    write_tables(zero_width_ranges, double_width_ranges, out_file=args.output_file)

    # Print summary: ranges are inclusive, hence the +1 per range
    zero_width_count = sum(end - start + 1 for start, end in zero_width_ranges)
    double_width_count = sum(end - start + 1 for start, end in double_width_ranges)
    print(f"Generated {args.output_file} with:")
    print(f"- {len(zero_width_ranges)} zero-width ranges covering ~{zero_width_count} code points")
    print(f"- {len(double_width_ranges)} double-width ranges covering ~{double_width_count} code points")
    print(f"- Unicode Version: {unicodedata.unidata_version}")
|