mirror of
https://github.com/viq/NewsBlur.git
synced 2025-08-05 16:49:45 +00:00
525 lines
No EOL
14 KiB
Objective-C
525 lines
No EOL
14 KiB
Objective-C
//
|
|
// GTMNSString+HTML.m
|
|
// Dealing with NSStrings that contain HTML
|
|
//
|
|
// Copyright 2006-2008 Google Inc.
|
|
//
|
|
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
|
|
// use this file except in compliance with the License. You may obtain a copy
|
|
// of the License at
|
|
//
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
|
//
|
|
// Unless required by applicable law or agreed to in writing, software
|
|
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
|
|
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
|
|
// License for the specific language governing permissions and limitations under
|
|
// the License.
|
|
//
|
|
|
|
#import "GTMDefines.h"
|
|
#import "GTMNString+HTML.h"
|
|
|
|
// One row of an escape table: a named HTML entity and the UTF-16 code unit
// it represents. The field order matters because the tables below use
// positional initializers ({ sequence, code unit }).
typedef struct HTMLEscapeMap {
  // The full entity text, including the leading '&' and trailing ';'.
  NSString *escapeSequence;
  // The character the entity stands for; also the bsearch key.
  unichar uchar;
} HTMLEscapeMap;
|
|
|
|
// Taken from http://www.w3.org/TR/xhtml1/dtds.html#a_dtd_Special_characters
// Ordered by uchar lowest to highest for bsearching
//
// Full table used when the output must be plain ASCII: it covers the
// "special", Latin-1 and symbol entity sets. Each row pairs the named entity
// (with its leading '&' and trailing ';') with the UTF-16 code unit it
// represents. The rows MUST stay sorted by code unit because lookups use
// bsearch(); the unescaping code also scans this table by entity name.
static HTMLEscapeMap gAsciiHTMLEscapeMap[] = {
  // A.2.2. Special characters
  { @"&quot;", 34 },
  { @"&amp;", 38 },
  { @"&apos;", 39 },
  { @"&lt;", 60 },
  { @"&gt;", 62 },

  // A.2.1. Latin-1 characters
  { @"&nbsp;", 160 },
  { @"&iexcl;", 161 },
  { @"&cent;", 162 },
  { @"&pound;", 163 },
  { @"&curren;", 164 },
  { @"&yen;", 165 },
  { @"&brvbar;", 166 },
  { @"&sect;", 167 },
  { @"&uml;", 168 },
  { @"&copy;", 169 },
  { @"&ordf;", 170 },
  { @"&laquo;", 171 },
  { @"&not;", 172 },
  { @"&shy;", 173 },
  { @"&reg;", 174 },
  { @"&macr;", 175 },
  { @"&deg;", 176 },
  { @"&plusmn;", 177 },
  { @"&sup2;", 178 },
  { @"&sup3;", 179 },
  { @"&acute;", 180 },
  { @"&micro;", 181 },
  { @"&para;", 182 },
  { @"&middot;", 183 },
  { @"&cedil;", 184 },
  { @"&sup1;", 185 },
  { @"&ordm;", 186 },
  { @"&raquo;", 187 },
  { @"&frac14;", 188 },
  { @"&frac12;", 189 },
  { @"&frac34;", 190 },
  { @"&iquest;", 191 },
  { @"&Agrave;", 192 },
  { @"&Aacute;", 193 },
  { @"&Acirc;", 194 },
  { @"&Atilde;", 195 },
  { @"&Auml;", 196 },
  { @"&Aring;", 197 },
  { @"&AElig;", 198 },
  { @"&Ccedil;", 199 },
  { @"&Egrave;", 200 },
  { @"&Eacute;", 201 },
  { @"&Ecirc;", 202 },
  { @"&Euml;", 203 },
  { @"&Igrave;", 204 },
  { @"&Iacute;", 205 },
  { @"&Icirc;", 206 },
  { @"&Iuml;", 207 },
  { @"&ETH;", 208 },
  { @"&Ntilde;", 209 },
  { @"&Ograve;", 210 },
  { @"&Oacute;", 211 },
  { @"&Ocirc;", 212 },
  { @"&Otilde;", 213 },
  { @"&Ouml;", 214 },
  { @"&times;", 215 },
  { @"&Oslash;", 216 },
  { @"&Ugrave;", 217 },
  { @"&Uacute;", 218 },
  { @"&Ucirc;", 219 },
  { @"&Uuml;", 220 },
  { @"&Yacute;", 221 },
  { @"&THORN;", 222 },
  { @"&szlig;", 223 },
  { @"&agrave;", 224 },
  { @"&aacute;", 225 },
  { @"&acirc;", 226 },
  { @"&atilde;", 227 },
  { @"&auml;", 228 },
  { @"&aring;", 229 },
  { @"&aelig;", 230 },
  { @"&ccedil;", 231 },
  { @"&egrave;", 232 },
  { @"&eacute;", 233 },
  { @"&ecirc;", 234 },
  { @"&euml;", 235 },
  { @"&igrave;", 236 },
  { @"&iacute;", 237 },
  { @"&icirc;", 238 },
  { @"&iuml;", 239 },
  { @"&eth;", 240 },
  { @"&ntilde;", 241 },
  { @"&ograve;", 242 },
  { @"&oacute;", 243 },
  { @"&ocirc;", 244 },
  { @"&otilde;", 245 },
  { @"&ouml;", 246 },
  { @"&divide;", 247 },
  { @"&oslash;", 248 },
  { @"&ugrave;", 249 },
  { @"&uacute;", 250 },
  { @"&ucirc;", 251 },
  { @"&uuml;", 252 },
  { @"&yacute;", 253 },
  { @"&thorn;", 254 },
  { @"&yuml;", 255 },

  // A.2.2. Special characters cont'd
  { @"&OElig;", 338 },
  { @"&oelig;", 339 },
  { @"&Scaron;", 352 },
  { @"&scaron;", 353 },
  { @"&Yuml;", 376 },

  // A.2.3. Symbols
  { @"&fnof;", 402 },

  // A.2.2. Special characters cont'd
  { @"&circ;", 710 },
  { @"&tilde;", 732 },

  // A.2.3. Symbols cont'd
  { @"&Alpha;", 913 },
  { @"&Beta;", 914 },
  { @"&Gamma;", 915 },
  { @"&Delta;", 916 },
  { @"&Epsilon;", 917 },
  { @"&Zeta;", 918 },
  { @"&Eta;", 919 },
  { @"&Theta;", 920 },
  { @"&Iota;", 921 },
  { @"&Kappa;", 922 },
  { @"&Lambda;", 923 },
  { @"&Mu;", 924 },
  { @"&Nu;", 925 },
  { @"&Xi;", 926 },
  { @"&Omicron;", 927 },
  { @"&Pi;", 928 },
  { @"&Rho;", 929 },
  { @"&Sigma;", 931 },
  { @"&Tau;", 932 },
  { @"&Upsilon;", 933 },
  { @"&Phi;", 934 },
  { @"&Chi;", 935 },
  { @"&Psi;", 936 },
  { @"&Omega;", 937 },
  { @"&alpha;", 945 },
  { @"&beta;", 946 },
  { @"&gamma;", 947 },
  { @"&delta;", 948 },
  { @"&epsilon;", 949 },
  { @"&zeta;", 950 },
  { @"&eta;", 951 },
  { @"&theta;", 952 },
  { @"&iota;", 953 },
  { @"&kappa;", 954 },
  { @"&lambda;", 955 },
  { @"&mu;", 956 },
  { @"&nu;", 957 },
  { @"&xi;", 958 },
  { @"&omicron;", 959 },
  { @"&pi;", 960 },
  { @"&rho;", 961 },
  { @"&sigmaf;", 962 },
  { @"&sigma;", 963 },
  { @"&tau;", 964 },
  { @"&upsilon;", 965 },
  { @"&phi;", 966 },
  { @"&chi;", 967 },
  { @"&psi;", 968 },
  { @"&omega;", 969 },
  { @"&thetasym;", 977 },
  { @"&upsih;", 978 },
  { @"&piv;", 982 },

  // A.2.2. Special characters cont'd
  { @"&ensp;", 8194 },
  { @"&emsp;", 8195 },
  { @"&thinsp;", 8201 },
  { @"&zwnj;", 8204 },
  { @"&zwj;", 8205 },
  { @"&lrm;", 8206 },
  { @"&rlm;", 8207 },
  { @"&ndash;", 8211 },
  { @"&mdash;", 8212 },
  { @"&lsquo;", 8216 },
  { @"&rsquo;", 8217 },
  { @"&sbquo;", 8218 },
  { @"&ldquo;", 8220 },
  { @"&rdquo;", 8221 },
  { @"&bdquo;", 8222 },
  { @"&dagger;", 8224 },
  { @"&Dagger;", 8225 },
  // A.2.3. Symbols cont'd
  { @"&bull;", 8226 },
  { @"&hellip;", 8230 },

  // A.2.2. Special characters cont'd
  { @"&permil;", 8240 },

  // A.2.3. Symbols cont'd
  { @"&prime;", 8242 },
  { @"&Prime;", 8243 },

  // A.2.2. Special characters cont'd
  { @"&lsaquo;", 8249 },
  { @"&rsaquo;", 8250 },

  // A.2.3. Symbols cont'd
  { @"&oline;", 8254 },
  { @"&frasl;", 8260 },

  // A.2.2. Special characters cont'd
  { @"&euro;", 8364 },

  // A.2.3. Symbols cont'd
  { @"&image;", 8465 },
  { @"&weierp;", 8472 },
  { @"&real;", 8476 },
  { @"&trade;", 8482 },
  { @"&alefsym;", 8501 },
  { @"&larr;", 8592 },
  { @"&uarr;", 8593 },
  { @"&rarr;", 8594 },
  { @"&darr;", 8595 },
  { @"&harr;", 8596 },
  { @"&crarr;", 8629 },
  { @"&lArr;", 8656 },
  { @"&uArr;", 8657 },
  { @"&rArr;", 8658 },
  { @"&dArr;", 8659 },
  { @"&hArr;", 8660 },
  { @"&forall;", 8704 },
  { @"&part;", 8706 },
  { @"&exist;", 8707 },
  { @"&empty;", 8709 },
  { @"&nabla;", 8711 },
  { @"&isin;", 8712 },
  { @"&notin;", 8713 },
  { @"&ni;", 8715 },
  { @"&prod;", 8719 },
  { @"&sum;", 8721 },
  { @"&minus;", 8722 },
  { @"&lowast;", 8727 },
  { @"&radic;", 8730 },
  { @"&prop;", 8733 },
  { @"&infin;", 8734 },
  { @"&ang;", 8736 },
  { @"&and;", 8743 },
  { @"&or;", 8744 },
  { @"&cap;", 8745 },
  { @"&cup;", 8746 },
  { @"&int;", 8747 },
  { @"&there4;", 8756 },
  { @"&sim;", 8764 },
  { @"&cong;", 8773 },
  { @"&asymp;", 8776 },
  { @"&ne;", 8800 },
  { @"&equiv;", 8801 },
  { @"&le;", 8804 },
  { @"&ge;", 8805 },
  { @"&sub;", 8834 },
  { @"&sup;", 8835 },
  { @"&nsub;", 8836 },
  { @"&sube;", 8838 },
  { @"&supe;", 8839 },
  { @"&oplus;", 8853 },
  { @"&otimes;", 8855 },
  { @"&perp;", 8869 },
  { @"&sdot;", 8901 },
  { @"&lceil;", 8968 },
  { @"&rceil;", 8969 },
  { @"&lfloor;", 8970 },
  { @"&rfloor;", 8971 },
  { @"&lang;", 9001 },
  { @"&rang;", 9002 },
  { @"&loz;", 9674 },
  { @"&spades;", 9824 },
  { @"&clubs;", 9827 },
  { @"&hearts;", 9829 },
  { @"&diams;", 9830 }
};
|
|
|
|
// Taken from http://www.w3.org/TR/xhtml1/dtds.html#a_dtd_Special_characters
// This is table A.2.2 Special Characters
//
// Reduced table used when the output may contain non-ASCII text: only the
// "special" entity set is escaped. Each row pairs the named entity (with its
// leading '&' and trailing ';') with the UTF-16 code unit it represents. The
// rows MUST stay sorted by code unit because lookups use bsearch().
static HTMLEscapeMap gUnicodeHTMLEscapeMap[] = {
  // C0 Controls and Basic Latin
  { @"&quot;", 34 },
  { @"&amp;", 38 },
  { @"&apos;", 39 },
  { @"&lt;", 60 },
  { @"&gt;", 62 },

  // Latin Extended-A
  { @"&OElig;", 338 },
  { @"&oelig;", 339 },
  { @"&Scaron;", 352 },
  { @"&scaron;", 353 },
  { @"&Yuml;", 376 },

  // Spacing Modifier Letters
  { @"&circ;", 710 },
  { @"&tilde;", 732 },

  // General Punctuation
  { @"&ensp;", 8194 },
  { @"&emsp;", 8195 },
  { @"&thinsp;", 8201 },
  { @"&zwnj;", 8204 },
  { @"&zwj;", 8205 },
  { @"&lrm;", 8206 },
  { @"&rlm;", 8207 },
  { @"&ndash;", 8211 },
  { @"&mdash;", 8212 },
  { @"&lsquo;", 8216 },
  { @"&rsquo;", 8217 },
  { @"&sbquo;", 8218 },
  { @"&ldquo;", 8220 },
  { @"&rdquo;", 8221 },
  { @"&bdquo;", 8222 },
  { @"&dagger;", 8224 },
  { @"&Dagger;", 8225 },
  { @"&permil;", 8240 },
  { @"&lsaquo;", 8249 },
  { @"&rsaquo;", 8250 },
  { @"&euro;", 8364 },
};
|
|
|
|
|
|
// bsearch() comparator for the escape tables above.
//
// |ucharVoid| points at the unichar being looked up and |mapVoid| at the
// HTMLEscapeMap row being probed. Returns 1 when the key sorts after the
// row's code unit, -1 when it sorts before it, and 0 when they are equal.
static int EscapeMapCompare(const void *ucharVoid, const void *mapVoid) {
  const unichar key = *(const unichar *)ucharVoid;
  const unichar probe = ((const HTMLEscapeMap *)mapVoid)->uchar;
  // Each comparison yields 0 or 1, so the difference is exactly -1, 0 or 1.
  return (key > probe) - (key < probe);
}
|
|
|
|
@implementation NSString (GTMNSStringHTMLAdditions)
|
|
|
|
// Returns the receiver with characters replaced by HTML escape sequences.
//
// Args:
//   table - escape table to consult; it is searched with bsearch(), so it
//           must be sorted ascending by |uchar|.
//   size - size of |table| in BYTES (callers pass sizeof(theTable)).
//   escapeUnicode - when YES, any UTF-16 code unit above 127 that has no
//                   entry in |table| is written as a decimal reference
//                   ("&#NNN;"); when NO such code units are copied as-is.
//
// Returns:
//   The receiver itself when it is empty, nil if a working buffer could not
//   be obtained, otherwise a new string with the escapes applied.
- (NSString *)gtm_stringByEscapingHTMLUsingTable:(HTMLEscapeMap*)table
                                          ofSize:(NSUInteger)size
                                 escapingUnicode:(BOOL)escapeUnicode {
  NSUInteger length = [self length];
  if (!length) {
    return self;
  }

  NSMutableString *finalString = [NSMutableString string];
  // Scratch space for runs of characters that need no escaping. It is created
  // with a real length (not just a capacity) because -mutableBytes is only
  // guaranteed to return writable storage covering the data's length.
  NSMutableData *data2 = [NSMutableData dataWithLength:sizeof(unichar) * length];

  // this block is common between GTMNSString+HTML and GTMNSString+XML but
  // it's so short that it isn't really worth trying to share.
  const unichar *buffer = CFStringGetCharactersPtr((CFStringRef)self);
  if (!buffer) {
    // No direct pointer to the characters is available, so copy them out.
    // We want this buffer to be autoreleased.
    NSMutableData *data = [NSMutableData dataWithLength:length * sizeof(UniChar)];
    if (!data) {
      // COV_NF_START  - Memory fail case
      _GTMDevLog(@"couldn't alloc buffer");
      return nil;
      // COV_NF_END
    }
    [self getCharacters:[data mutableBytes] range:NSMakeRange(0, length)];
    buffer = [data bytes];
  }

  if (!buffer || !data2) {
    // COV_NF_START
    _GTMDevLog(@"Unable to allocate buffer or data2");
    return nil;
    // COV_NF_END
  }

  unichar *buffer2 = (unichar *)[data2 mutableBytes];
  NSUInteger buffer2Length = 0;

  // |size| is in bytes; bsearch wants an element count.
  const size_t entryCount = size / sizeof(HTMLEscapeMap);

  for (NSUInteger i = 0; i < length; ++i) {
    HTMLEscapeMap *val = bsearch(&buffer[i], table,
                                 entryCount,
                                 sizeof(HTMLEscapeMap), EscapeMapCompare);
    if (val || (escapeUnicode && buffer[i] > 127)) {
      // Flush the pending run of unescaped characters before the escape.
      if (buffer2Length) {
        CFStringAppendCharacters((CFMutableStringRef)finalString,
                                 buffer2,
                                 buffer2Length);
        buffer2Length = 0;
      }
      if (val) {
        [finalString appendString:val->escapeSequence];
      } else {
        _GTMDevAssert(escapeUnicode && buffer[i] > 127, @"Illegal Character");
        [finalString appendFormat:@"&#%d;", buffer[i]];
      }
    } else {
      buffer2[buffer2Length] = buffer[i];
      buffer2Length += 1;
    }
  }
  // Flush whatever unescaped tail is left.
  if (buffer2Length) {
    CFStringAppendCharacters((CFMutableStringRef)finalString,
                             buffer2,
                             buffer2Length);
  }
  return finalString;
}
|
|
|
|
// Escapes the receiver using gUnicodeHTMLEscapeMap. Characters that are not
// in that table, including code units above 127, are passed through
// untouched (escapingUnicode is NO).
- (NSString *)gtm_stringByEscapingForHTML {
  const NSUInteger tableBytes = sizeof(gUnicodeHTMLEscapeMap);
  NSString *escaped =
      [self gtm_stringByEscapingHTMLUsingTable:gUnicodeHTMLEscapeMap
                                        ofSize:tableBytes
                               escapingUnicode:NO];
  return escaped;
} // gtm_stringByEscapingHTML
|
|
|
|
// Escapes the receiver using gAsciiHTMLEscapeMap and, because escapingUnicode
// is YES, also writes every remaining code unit above 127 as a decimal
// reference.
- (NSString *)gtm_stringByEscapingForAsciiHTML {
  const NSUInteger tableBytes = sizeof(gAsciiHTMLEscapeMap);
  NSString *escaped =
      [self gtm_stringByEscapingHTMLUsingTable:gAsciiHTMLEscapeMap
                                        ofSize:tableBytes
                               escapingUnicode:YES];
  return escaped;
} // gtm_stringByEscapingAsciiHTML
|
|
|
|
// Converts the value of a numeric character reference into a string, or
// returns nil when the value is not accepted.
//
// Values 1..0xFFFE become a single UTF-16 code unit. Values 0x10000..0x10FFFF
// are encoded as a surrogate pair. Zero, 0xFFFF and anything above 0x10FFFF
// are rejected.
static NSString *GTMStringFromHTMLCodePoint(unsigned int codePoint) {
  if (codePoint == 0) {
    return nil;
  }
  if (codePoint < 0xFFFF) {
    unichar uchar = (unichar)codePoint;
    return [NSString stringWithCharacters:&uchar length:1];
  }
  if (codePoint >= 0x10000 && codePoint <= 0x10FFFF) {
    unsigned int offset = codePoint - 0x10000;
    unichar pair[2];
    pair[0] = (unichar)(0xD800 + (offset >> 10));    // high (lead) surrogate
    pair[1] = (unichar)(0xDC00 + (offset & 0x3FF));  // low (trail) surrogate
    return [NSString stringWithCharacters:pair length:2];
  }
  return nil;
}

// Returns the receiver with HTML escape sequences replaced by the characters
// they stand for.
//
// Handles hex references (&#xa3;), decimal references (&#123;) and the named
// entities listed in gAsciiHTMLEscapeMap. Sequences that are malformed,
// unknown, or out of range are left in the output unchanged. Returns the
// receiver itself when it contains no '&'.
- (NSString *)gtm_stringByUnescapingFromHTML {
  NSRange range = NSMakeRange(0, [self length]);
  NSRange subrange = [self rangeOfString:@"&" options:NSBackwardsSearch range:range];

  // if no ampersands, we've got a quick way out
  if (subrange.length == 0) return self;

  // The receiver is scanned from the end towards the start while edits are
  // applied to |finalString|. Every edit happens after the text still to be
  // scanned, so ranges computed on the receiver stay valid in |finalString|.
  NSMutableString *finalString = [NSMutableString stringWithString:self];
  do {
    // Look for the ';' between this '&' and the previously handled '&'.
    NSRange semiColonRange = NSMakeRange(subrange.location, NSMaxRange(range) - subrange.location);
    semiColonRange = [self rangeOfString:@";" options:0 range:semiColonRange];
    // The next backwards search only considers text before this '&'.
    range = NSMakeRange(0, subrange.location);
    // if we don't find a semicolon in the range, we don't have a sequence
    if (semiColonRange.location == NSNotFound) {
      continue;
    }
    NSRange escapeRange = NSMakeRange(subrange.location, semiColonRange.location - subrange.location + 1);
    NSString *escapeString = [self substringWithRange:escapeRange];
    NSUInteger length = [escapeString length];
    // a sequence must be longer than 3 (&lt;) and less than 11 (&thetasym;)
    if (length <= 3 || length >= 11) {
      continue;
    }

    NSString *replacement = nil;
    if ([escapeString characterAtIndex:1] == '#') {
      unichar char2 = [escapeString characterAtIndex:2];
      if (char2 == 'x' || char2 == 'X') {
        // Hex escape sequences &#xa3;
        // Strip the leading "&#x" and the trailing ';'.
        NSString *hexSequence = [escapeString substringWithRange:NSMakeRange(3, length - 4)];
        NSScanner *scanner = [NSScanner scannerWithString:hexSequence];
        unsigned value;
        // The scanner must consume every digit, otherwise the sequence has
        // trailing garbage and is left alone.
        if ([scanner scanHexInt:&value] &&
            [scanner scanLocation] == length - 4) {
          replacement = GTMStringFromHTMLCodePoint(value);
        }
      } else {
        // Decimal Sequences &#123;
        // Strip the leading "&#" and the trailing ';'.
        NSString *numberSequence = [escapeString substringWithRange:NSMakeRange(2, length - 3)];
        NSScanner *scanner = [NSScanner scannerWithString:numberSequence];
        int value;
        if ([scanner scanInt:&value] &&
            value > 0 &&
            [scanner scanLocation] == length - 3) {
          replacement = GTMStringFromHTMLCodePoint((unsigned int)value);
        }
      }
    } else {
      // "standard" sequences
      const NSUInteger entryCount = sizeof(gAsciiHTMLEscapeMap) / sizeof(HTMLEscapeMap);
      for (NSUInteger i = 0; i < entryCount; ++i) {
        if ([escapeString isEqualToString:gAsciiHTMLEscapeMap[i].escapeSequence]) {
          replacement = [NSString stringWithCharacters:&gAsciiHTMLEscapeMap[i].uchar length:1];
          break;
        }
      }
    }

    if (replacement) {
      [finalString replaceCharactersInRange:escapeRange withString:replacement];
    }
  } while ((subrange = [self rangeOfString:@"&" options:NSBackwardsSearch range:range]).length != 0);
  return finalString;
} // gtm_stringByUnescapingHTML
|
|
|
|
|
|
|
|
@end |