2025-04-17 14:45:04 -04:00
|
|
|
|
// SPDX-License-Identifier: GPL-2.0
|
|
|
|
|
/*
|
|
|
|
|
* ucs.c - Universal Character Set processing
|
|
|
|
|
*/
|
|
|
|
|
|
|
|
|
|
#include <linux/array_size.h>
|
|
|
|
|
#include <linux/bsearch.h>
|
|
|
|
|
#include <linux/consolemap.h>
|
|
|
|
|
#include <linux/minmax.h>
|
|
|
|
|
|
2025-04-17 14:45:15 -04:00
|
|
|
|
struct ucs_interval16 {
|
|
|
|
|
u16 first;
|
|
|
|
|
u16 last;
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
struct ucs_interval32 {
|
2025-04-17 14:45:04 -04:00
|
|
|
|
u32 first;
|
|
|
|
|
u32 last;
|
|
|
|
|
};
|
|
|
|
|
|
2025-04-17 14:45:08 -04:00
|
|
|
|
#include "ucs_width_table.h"
|
2025-04-17 14:45:04 -04:00
|
|
|
|
|
2025-04-17 14:45:15 -04:00
|
|
|
|
static int interval16_cmp(const void *key, const void *element)
|
|
|
|
|
{
|
|
|
|
|
u16 cp = *(u16 *)key;
|
|
|
|
|
const struct ucs_interval16 *entry = element;
|
|
|
|
|
|
|
|
|
|
if (cp < entry->first)
|
|
|
|
|
return -1;
|
|
|
|
|
if (cp > entry->last)
|
|
|
|
|
return 1;
|
|
|
|
|
return 0;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
static int interval32_cmp(const void *key, const void *element)
|
2025-04-17 14:45:04 -04:00
|
|
|
|
{
|
|
|
|
|
u32 cp = *(u32 *)key;
|
2025-04-17 14:45:15 -04:00
|
|
|
|
const struct ucs_interval32 *entry = element;
|
2025-04-17 14:45:04 -04:00
|
|
|
|
|
|
|
|
|
if (cp < entry->first)
|
|
|
|
|
return -1;
|
|
|
|
|
if (cp > entry->last)
|
|
|
|
|
return 1;
|
|
|
|
|
return 0;
|
|
|
|
|
}
|
|
|
|
|
|
2025-04-17 14:45:15 -04:00
|
|
|
|
/*
 * Test whether a 16-bit code point falls inside any interval of a table.
 *
 * @cp: code point to look up
 * @ranges: interval table; the bounds check and the binary search below
 *          rely on it being sorted in ascending order
 * @size: number of entries in @ranges
 *
 * Return: true if @cp lies within one of the intervals, false otherwise
 * (including when the table is empty).
 */
static bool cp_in_range16(u16 cp, const struct ucs_interval16 *ranges, size_t size)
{
	/* An empty table matches nothing; this also keeps ranges[size - 1] in bounds. */
	if (!size)
		return false;

	/* Quick rejection of anything outside the span covered by the table. */
	if (cp < ranges[0].first || cp > ranges[size - 1].last)
		return false;

	return __inline_bsearch(&cp, ranges, size, sizeof(*ranges),
				interval16_cmp) != NULL;
}
|
|
|
|
|
|
2025-04-17 14:45:15 -04:00
|
|
|
|
/*
 * Test whether a 32-bit code point falls inside any interval of a table.
 *
 * @cp: code point to look up
 * @ranges: interval table; the bounds check and the binary search below
 *          rely on it being sorted in ascending order
 * @size: number of entries in @ranges
 *
 * Return: true if @cp lies within one of the intervals, false otherwise
 * (including when the table is empty).
 */
static bool cp_in_range32(u32 cp, const struct ucs_interval32 *ranges, size_t size)
{
	/* An empty table matches nothing; this also keeps ranges[size - 1] in bounds. */
	if (!size)
		return false;

	/* Quick rejection of anything outside the span covered by the table. */
	if (cp < ranges[0].first || cp > ranges[size - 1].last)
		return false;

	return __inline_bsearch(&cp, ranges, size, sizeof(*ranges),
				interval32_cmp) != NULL;
}
|
|
|
|
|
|
|
|
|
|
/* True when @cp is at most 0xffff, i.e. within the Basic Multilingual Plane. */
#define UCS_IS_BMP(cp) ((cp) <= 0xffff)
|
|
|
|
|
|
2025-04-17 14:45:08 -04:00
|
|
|
|
/**
|
|
|
|
|
* ucs_is_zero_width() - Determine if a Unicode code point is zero-width.
|
|
|
|
|
* @cp: Unicode code point (UCS-4)
|
|
|
|
|
*
|
|
|
|
|
* Return: true if the character is zero-width, false otherwise
|
|
|
|
|
*/
|
|
|
|
|
bool ucs_is_zero_width(u32 cp)
|
|
|
|
|
{
|
2025-04-17 14:45:15 -04:00
|
|
|
|
if (UCS_IS_BMP(cp))
|
|
|
|
|
return cp_in_range16(cp, ucs_zero_width_bmp_ranges,
|
|
|
|
|
ARRAY_SIZE(ucs_zero_width_bmp_ranges));
|
|
|
|
|
else
|
|
|
|
|
return cp_in_range32(cp, ucs_zero_width_non_bmp_ranges,
|
|
|
|
|
ARRAY_SIZE(ucs_zero_width_non_bmp_ranges));
|
2025-04-17 14:45:08 -04:00
|
|
|
|
}
|
|
|
|
|
|
2025-04-17 14:45:04 -04:00
|
|
|
|
/**
|
|
|
|
|
* ucs_is_double_width() - Determine if a Unicode code point is double-width.
|
|
|
|
|
* @cp: Unicode code point (UCS-4)
|
|
|
|
|
*
|
|
|
|
|
* Return: true if the character is double-width, false otherwise
|
|
|
|
|
*/
|
|
|
|
|
bool ucs_is_double_width(u32 cp)
|
|
|
|
|
{
|
2025-04-17 14:45:15 -04:00
|
|
|
|
if (UCS_IS_BMP(cp))
|
|
|
|
|
return cp_in_range16(cp, ucs_double_width_bmp_ranges,
|
|
|
|
|
ARRAY_SIZE(ucs_double_width_bmp_ranges));
|
|
|
|
|
else
|
|
|
|
|
return cp_in_range32(cp, ucs_double_width_non_bmp_ranges,
|
|
|
|
|
ARRAY_SIZE(ucs_double_width_non_bmp_ranges));
|
2025-04-17 14:45:04 -04:00
|
|
|
|
}
|
2025-04-17 14:45:11 -04:00
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
|
* Structure for base with combining mark pairs and resulting recompositions.
|
|
|
|
|
* Using u16 to save space since all values are within BMP range.
|
|
|
|
|
*/
|
|
|
|
|
struct ucs_recomposition {
|
|
|
|
|
u16 base; /* base character */
|
|
|
|
|
u16 mark; /* combining mark */
|
|
|
|
|
u16 recomposed; /* corresponding recomposed character */
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
#include "ucs_recompose_table.h"
|
|
|
|
|
|
|
|
|
|
struct compare_key {
|
|
|
|
|
u16 base;
|
|
|
|
|
u16 mark;
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
static int recomposition_cmp(const void *key, const void *element)
|
|
|
|
|
{
|
|
|
|
|
const struct compare_key *search_key = key;
|
|
|
|
|
const struct ucs_recomposition *entry = element;
|
|
|
|
|
|
|
|
|
|
/* Compare base character first */
|
|
|
|
|
if (search_key->base < entry->base)
|
|
|
|
|
return -1;
|
|
|
|
|
if (search_key->base > entry->base)
|
|
|
|
|
return 1;
|
|
|
|
|
|
|
|
|
|
/* Base characters match, now compare combining character */
|
|
|
|
|
if (search_key->mark < entry->mark)
|
|
|
|
|
return -1;
|
|
|
|
|
if (search_key->mark > entry->mark)
|
|
|
|
|
return 1;
|
|
|
|
|
|
|
|
|
|
/* Both match */
|
|
|
|
|
return 0;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* ucs_recompose() - Attempt to recompose two Unicode characters into a single character.
|
|
|
|
|
* @base: Base Unicode code point (UCS-4)
|
|
|
|
|
* @mark: Combining mark Unicode code point (UCS-4)
|
|
|
|
|
*
|
|
|
|
|
* Return: Recomposed Unicode code point, or 0 if no recomposition is possible
|
|
|
|
|
*/
|
|
|
|
|
u32 ucs_recompose(u32 base, u32 mark)
|
|
|
|
|
{
|
|
|
|
|
/* Check if characters are within the range of our table */
|
2025-05-07 10:13:16 -04:00
|
|
|
|
if (base < UCS_RECOMPOSE_MIN_BASE || base > UCS_RECOMPOSE_MAX_BASE ||
|
|
|
|
|
mark < UCS_RECOMPOSE_MIN_MARK || mark > UCS_RECOMPOSE_MAX_MARK)
|
2025-04-17 14:45:11 -04:00
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
|
|
struct compare_key key = { base, mark };
|
|
|
|
|
struct ucs_recomposition *result =
|
|
|
|
|
__inline_bsearch(&key, ucs_recomposition_table,
|
|
|
|
|
ARRAY_SIZE(ucs_recomposition_table),
|
|
|
|
|
sizeof(*ucs_recomposition_table),
|
|
|
|
|
recomposition_cmp);
|
|
|
|
|
|
|
|
|
|
return result ? result->recomposed : 0;
|
|
|
|
|
}
|
2025-05-07 10:13:21 -04:00
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
|
* The fallback table structures implement a 2-level lookup.
|
|
|
|
|
*/
|
|
|
|
|
|
|
|
|
|
struct ucs_page_desc {
|
|
|
|
|
u8 page; /* Page index (high byte of code points) */
|
|
|
|
|
u8 count; /* Number of entries in this page */
|
|
|
|
|
u16 start; /* Start index in entries array */
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
struct ucs_page_entry {
|
|
|
|
|
u8 offset; /* Offset within page (0-255) */
|
|
|
|
|
u8 fallback; /* Fallback character or range start marker */
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
#include "ucs_fallback_table.h"
|
|
|
|
|
|
|
|
|
|
static int ucs_page_desc_cmp(const void *key, const void *element)
|
|
|
|
|
{
|
|
|
|
|
u8 page = *(u8 *)key;
|
|
|
|
|
const struct ucs_page_desc *entry = element;
|
|
|
|
|
|
|
|
|
|
if (page < entry->page)
|
|
|
|
|
return -1;
|
|
|
|
|
if (page > entry->page)
|
|
|
|
|
return 1;
|
|
|
|
|
return 0;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
static int ucs_page_entry_cmp(const void *key, const void *element)
|
|
|
|
|
{
|
|
|
|
|
u8 offset = *(u8 *)key;
|
|
|
|
|
const struct ucs_page_entry *entry = element;
|
|
|
|
|
|
|
|
|
|
if (offset < entry->offset)
|
|
|
|
|
return -1;
|
|
|
|
|
if (entry->fallback == UCS_PAGE_ENTRY_RANGE_MARKER) {
|
|
|
|
|
if (offset > entry[1].offset)
|
|
|
|
|
return 1;
|
|
|
|
|
} else {
|
|
|
|
|
if (offset > entry->offset)
|
|
|
|
|
return 1;
|
|
|
|
|
}
|
|
|
|
|
return 0;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* ucs_get_fallback() - Get a substitution for the provided Unicode character
|
2025-06-10 19:02:29 -07:00
|
|
|
|
* @cp: Unicode code point (UCS-4)
|
2025-05-07 10:13:21 -04:00
|
|
|
|
*
|
|
|
|
|
* Get a simpler fallback character for the provided Unicode character.
|
|
|
|
|
* This is used for terminal display when corresponding glyph is unavailable.
|
|
|
|
|
* The substitution may not be as good as the actual glyph for the original
|
|
|
|
|
* character but still way more helpful than a squared question mark.
|
|
|
|
|
*
|
|
|
|
|
* Return: Fallback Unicode code point, or 0 if none is available
|
|
|
|
|
*/
|
|
|
|
|
u32 ucs_get_fallback(u32 cp)
|
|
|
|
|
{
|
|
|
|
|
const struct ucs_page_desc *page;
|
|
|
|
|
const struct ucs_page_entry *entry;
|
|
|
|
|
u8 page_idx = cp >> 8, offset = cp;
|
|
|
|
|
|
|
|
|
|
if (!UCS_IS_BMP(cp))
|
|
|
|
|
return 0;
|
|
|
|
|
|
2025-05-07 10:13:23 -04:00
|
|
|
|
/*
|
|
|
|
|
* Full-width to ASCII mapping (covering all printable ASCII 33-126)
|
|
|
|
|
* 0xFF01 (!) to 0xFF5E (~) -> ASCII 33 (!) to 126 (~)
|
|
|
|
|
* We process them programmatically to reduce the table size.
|
|
|
|
|
*/
|
|
|
|
|
if (cp >= 0xFF01 && cp <= 0xFF5E)
|
|
|
|
|
return cp - 0xFF01 + 33;
|
|
|
|
|
|
2025-05-07 10:13:21 -04:00
|
|
|
|
page = __inline_bsearch(&page_idx, ucs_fallback_pages,
|
|
|
|
|
ARRAY_SIZE(ucs_fallback_pages),
|
|
|
|
|
sizeof(*ucs_fallback_pages),
|
|
|
|
|
ucs_page_desc_cmp);
|
|
|
|
|
if (!page)
|
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
|
|
entry = __inline_bsearch(&offset, ucs_fallback_entries + page->start,
|
|
|
|
|
page->count, sizeof(*ucs_fallback_entries),
|
|
|
|
|
ucs_page_entry_cmp);
|
|
|
|
|
if (!entry)
|
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
|
|
if (entry->fallback == UCS_PAGE_ENTRY_RANGE_MARKER)
|
|
|
|
|
entry++;
|
|
|
|
|
return entry->fallback;
|
|
|
|
|
}
|