Many hyperlinks are disabled.
Use anonymous login
to enable hyperlinks.
Changes In Branch wip Excluding Merge-Ins
This is equivalent to a diff from fa8d38c58f to ed9de65f55
2024-02-26
| ||
00:37 | wip Leaf check-in: ed9de65f55 user: rolf tags: wip | |
2024-02-24
| ||
02:46 | wip check-in: 69c8348852 user: rolf tags: wip | |
2024-02-23
| ||
01:36 | Updated recognized HTML entities. Leaf check-in: fa8d38c58f user: rolf tags: HTML5Entities | |
01:18 | Merged from trunk. check-in: c4776de175 user: rolf tags: HTML5Entities | |
Changes to generic/domhtml.c.
︙ | ︙ | |||
2301 2302 2303 2304 2305 2306 2307 | } /* ErInit */ /*---------------------------------------------------------------------------- | TranslateEntityRefs -- | | | > > > | > > < | > | > | > > > > > > > | 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 | } /* ErInit */ /*---------------------------------------------------------------------------- | TranslateEntityRefs -- | | Translate entity references and character references in the | nodeValue of the domTextNode or domAttrNode given as | argument. Since all character references and almost all | entity references are longer or equal in length as the | referenced UTF-8 byte sequence the translation is basically | done by rewriting the nodeValue in place and with special | handling of the few counterexamples. | | Unrecognized entity references are unaltered. | | Example: | | input = "AT&amp;T &gt; MCI" | output = "AT&T > MCI" | \---------------------------------------------------------------------------*/ static void TranslateEntityRefs ( domTextNode *textOrAtt ) { char *z; /* Pointer to nodeValue to rewrite */ int from; /* Read characters from this position in z[] */ int to; /* Write characters into this position in z[] */ int h; /* A hash on the entity reference */ char *zVal; /* The substituted value */ Er *p; /* For looping down the entity reference collision chain */ int value, zlen, overlen; char *ole, *newNodeValue; if (textOrAtt->nodeType == ATTRIBUTE_NODE) { z = ((domAttrNode*)textOrAtt)->nodeValue; zlen = ((domAttrNode*)textOrAtt)->valueLength; } else { z = textOrAtt->nodeValue; zlen = textOrAtt->valueLength; } from = to = 0; if (bErNeedsInit) { TDomThreaded(Tcl_MutexLock(&initMutex);) if (bErNeedsInit) { ErInit(); bErNeedsInit = 0; |
︙ | ︙ | |||
2414 2415 2416 2417 2418 2419 2420 | z[to++] = (char) ((value >> 6) | 0xC0); z[to++] = (char) ((value | 0x80) & 0xBF); } else if (value <= 0xFFFF) { z[to++] = (char) ((value >> 12) | 0xE0); z[to++] = (char) (((value >> 6) | 0x80) & 0xBF); z[to++] = (char) ((value | 0x80) & 0xBF); } else { | | > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > | > > > > | > | 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 | z[to++] = (char) ((value >> 6) | 0xC0); z[to++] = (char) ((value | 0x80) & 0xBF); } else if (value <= 0xFFFF) { z[to++] = (char) ((value >> 12) | 0xE0); z[to++] = (char) (((value >> 6) | 0x80) & 0xBF); z[to++] = (char) ((value | 0x80) & 0xBF); } else { z[to++] = (char) ((value >> 18) | 0xf0); z[to++] = (char) (((value >> 12) & 0x3f) | 0x80); z[to++] = (char) (((value >> 6) & 0x3f) | 0x80); z[to++] = (char) ((value & 0x3f) | 0x80); } from = i+1; } } else { while (z[i] && isalnum((unsigned char)z[i])) { i++; } c = z[i]; z[i] = 0; h = ErHash(&z[from+1]); p = apErHash[h]; while (p && strcmp(p->zName,&z[from+1])!=0 ) { p = p->pNext; } ole = NULL; if (!p && c == ';') { /* Entity name not found. It may be one of the few * entities with a referenced UTF-8 byte sequence * which is longer than the reference. 
*/ if (strcmp("nGt",&z[from+1]) == 0) { ole = "\xE2\x89\xAB\xE2\x83\x92\x00"; overlen = 1; } else if (strcmp("nLt",&z[from+1]) == 0) { ole = "\xE2\x89\xAA\xE2\x83\x92\x00"; overlen = 1; } } z[i] = c; if (p) { zVal = p->zValue; while (*zVal) { z[to++] = *(zVal++); } from = i; if (c==';') from++; } else { if (ole) { /* Over-long entity reference */ from = i; newNodeValue = MALLOC(zlen + 1 + overlen); memmove(newNodeValue,z,to); while (*ole) { newNodeValue[to++] = *(ole++); } memmove(newNodeValue + to, z + from + 1 , zlen - from); z = newNodeValue; zlen = zlen + overlen; if (textOrAtt->nodeType == ATTRIBUTE_NODE) { FREE (((domAttrNode*)textOrAtt)->nodeValue); ((domAttrNode*)textOrAtt)->nodeValue = z; ((domAttrNode*)textOrAtt)->valueLength = zlen; } else { FREE (textOrAtt->nodeValue); textOrAtt->nodeValue = z; textOrAtt->valueLength = zlen; } from = to; } else { z[to++] = z[from++]; } } } } else { z[to++] = z[from++]; } } z[to] = 0; if (textOrAtt->nodeType == ATTRIBUTE_NODE) { ((domAttrNode*)textOrAtt)->valueLength = to; } else { textOrAtt->valueLength = to; } } /*---------------------------------------------------------------------------- | End Of Character Entity Translator \---------------------------------------------------------------------------*/ |
︙ | ︙ | |||
2539 2540 2541 2542 2543 2544 2545 | tnode->nodeNumber = NODE_NO(doc); tnode->valueLength = (x - start); tnode->nodeValue = (char*)MALLOC((x - start)+1); memmove(tnode->nodeValue, start, (x - start)); *(tnode->nodeValue + (x - start)) = 0; DBG(fprintf(stderr, "New text node: '%s'\n", tnode->nodeValue);) if (ampersandSeen) { | | | 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 | tnode->nodeNumber = NODE_NO(doc); tnode->valueLength = (x - start); tnode->nodeValue = (char*)MALLOC((x - start)+1); memmove(tnode->nodeValue, start, (x - start)); *(tnode->nodeValue + (x - start)) = 0; DBG(fprintf(stderr, "New text node: '%s'\n", tnode->nodeValue);) if (ampersandSeen) { TranslateEntityRefs (tnode); } tnode->parentNode = parent_node; if (parent_node->firstChild) { parent_node->lastChild->nextSibling = (domNode*)tnode; tnode->previousSibling = parent_node->lastChild; parent_node->lastChild = (domNode*)tnode; } else { |
︙ | ︙ | |||
3091 3092 3093 3094 3095 3096 3097 | attrnode->nodeName = (char *)&(h->key); attrnode->nodeType = ATTRIBUTE_NODE; attrnode->nodeValue = (char*)MALLOC(nArgVal+1); attrnode->valueLength = nArgVal; memmove(attrnode->nodeValue, ArgVal, nArgVal); *(attrnode->nodeValue + nArgVal) = 0; if (ampersandSeen) { | | < | 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 | attrnode->nodeName = (char *)&(h->key); attrnode->nodeType = ATTRIBUTE_NODE; attrnode->nodeValue = (char*)MALLOC(nArgVal+1); attrnode->valueLength = nArgVal; memmove(attrnode->nodeValue, ArgVal, nArgVal); *(attrnode->nodeValue + nArgVal) = 0; if (ampersandSeen) { TranslateEntityRefs ((domTextNode*)attrnode); } if (!strcmp(ArgName, "id")) { if (!doc->ids) { doc->ids = (Tcl_HashTable *) MALLOC (sizeof (Tcl_HashTable)); Tcl_InitHashTable (doc->ids, TCL_STRING_KEYS); } |
︙ | ︙ |
Changes to tests/htmlreader.test.
︙ | ︙ | |||
73 74 75 76 77 78 79 | proc toutf8 c { set s [encoding convertto utf-8 $c] binary scan $s H* x regsub -all -expanded {..} $x {\x&} } | | | 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 | proc toutf8 c { set s [encoding convertto utf-8 $c] binary scan $s H* x regsub -all -expanded {..} $x {\x&} } test html-1.8 {character entities} {Tcl9} { set result "" foreach {entity byteseq} { "AElig" "\\xC3\\x86" "AMP" "\\x26" "Aacute" "\\xC3\\x81" "Abreve" "\\xC4\\x82" "Acirc" "\\xC3\\x82" |
︙ | ︙ | |||
2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 | lappend result $entity [toutf8 [$body text]] [string tolower $byteseq] } $doc delete } set result } "" test html-2.1 {not closed p tags} { set doc [dom parse -html { <html><body><p>Para 1<p>Para 2<p>Para 3</body></html> }] set result [$doc asXML -indent none] $doc delete | > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > | 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 | lappend result $entity [toutf8 [$body text]] [string tolower $byteseq] } $doc delete } set result } "" test html-1.9 {non-existing character entities} { set doc [dom parse -html {<html>&abcdef;</html>}] set root [$doc documentElement] set result [$root text] $doc delete set result } "&abcdef;" test html-1.10 {non-BMP character reference} {Tcl9} { set doc [dom parse -html {<html>𝒞𝒞</html>}] set root [$doc documentElement] set result [$root text] $doc delete set result } "\U1d49e\U1d49e" test html-1.11 {Premature end of entity reference with overlong replacement} {Tcl9} { set doc [dom parse -html {<html>&nGt</html>}] set root [$doc documentElement] set result [$root text] $doc delete set result } "&nGt" test html-1.12 {Entity reference with overlong replacement} {Tcl9} { set doc [dom parse -html {<html>&nGt;</html>}] set root [$doc documentElement] set result [$root text] $doc delete set result } "\u226B\u20D2" test html-1.13 {Entity reference with overlong replacement} {Tcl9} { set doc [dom parse -html {<html>&nGt;abc&nGt;∡&nLt;&nGt;foobar</html>}] set root [$doc documentElement] set result [$root text] $doc delete set result } "\u226B\u20D2abc\u226B\u20D2\u2221\u226A\u20D2\u226B\u20D2foobar" test html-1.14 {Entity reference with overlong replacement} 
{Tcl9} { set doc [dom parse -html {<html>&nGt;abc</html>}] set root [$doc documentElement] set result [$root text] $doc delete set result } "\u226B\u20D2abc" test html-2.1 {not closed p tags} { set doc [dom parse -html { <html><body><p>Para 1<p>Para 2<p>Para 3</body></html> }] set result [$doc asXML -indent none] $doc delete |
︙ | ︙ |