Many hyperlinks are disabled.
Use anonymous login to enable hyperlinks.

Changes In Branch wip Excluding Merge-Ins

This is equivalent to a diff from fa8d38c58f to ed9de65f55

2024-02-26
00:37
wip Leaf check-in: ed9de65f55 user: rolf tags: wip
2024-02-24
02:46
wip check-in: 69c8348852 user: rolf tags: wip
2024-02-23
01:36
Updated recognized HTML entities. Leaf check-in: fa8d38c58f user: rolf tags: HTML5Entities
01:18
Merged from trunk. check-in: c4776de175 user: rolf tags: HTML5Entities

Changes to generic/domhtml.c.

2301
2302
2303
2304
2305
2306
2307
2308
2309







2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321

2322
2323

2324
2325
2326
2327
2328
2329
2330










2331
2332
2333
2334
2335
2336
2337
2301
2302
2303
2304
2305
2306
2307


2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324


2325
2326
2327
2328
2329
2330
2331
2332
2333


2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350







-
-
+
+
+
+
+
+
+










-
-
+


+





-
-
+
+
+
+
+
+
+
+
+
+








} /* ErInit */


/*----------------------------------------------------------------------------
|    TranslateEntityRefs  --
|
|        Translate entity references and character references in the string
|        "z".  "z" is overwritten with the translated sequence.
|        Translate entity references and character references in the
|        nodeValue of the domTextNode or domAttrNode given as
|        argument. Since all character references and almost all
|        entity references are at least as long as the UTF-8 byte
|        sequence they refer to, the translation is basically done
|        by rewriting the nodeValue in place, with special handling
|        of the few counterexamples.
|
|        Unrecognized entity references are unaltered.
|
|        Example:
|
|          input =    "AT&T &gt MCI"
|          output =   "AT&T > MCI"
|
\---------------------------------------------------------------------------*/
static void TranslateEntityRefs (
    char *z,
    domLength  *newLen
    domTextNode *textOrAtt
)
{
    char *z;     /* Pointer to nodeValue to rewrite */
    int from;    /* Read characters from this position in z[] */
    int to;      /* Write characters into this position in z[] */
    int h;       /* A hash on the entity reference */
    char *zVal;  /* The substituted value */
    Er *p;       /* For looping down the entity reference collision chain */
    int value;

    int value, zlen, overlen; 
    char *ole, *newNodeValue;
    
    if (textOrAtt->nodeType == ATTRIBUTE_NODE) {
        z = ((domAttrNode*)textOrAtt)->nodeValue;
        zlen = ((domAttrNode*)textOrAtt)->valueLength;
    } else {
        z = textOrAtt->nodeValue;
        zlen = textOrAtt->valueLength;
    }
    from = to = 0;

    if (bErNeedsInit) {
        TDomThreaded(Tcl_MutexLock(&initMutex);)
        if (bErNeedsInit) {
            ErInit();
            bErNeedsInit = 0;
2414
2415
2416
2417
2418
2419
2420
2421




2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434













2435
2436
2437
2438
2439
2440
2441
2442
2443
2444























2445


2446
2447
2448
2449
2450
2451
2452



2453


2454
2455
2456
2457
2458
2459
2460
2427
2428
2429
2430
2431
2432
2433

2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496

2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508

2509
2510
2511
2512
2513
2514
2515
2516
2517







-
+
+
+
+













+
+
+
+
+
+
+
+
+
+
+
+
+










+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
-
+
+







+
+
+
-
+
+







                    z[to++] = (char) ((value >> 6) | 0xC0);
                    z[to++] = (char) ((value | 0x80) & 0xBF);
                } else if (value <= 0xFFFF) {
                    z[to++] = (char) ((value >> 12) | 0xE0);
                    z[to++] = (char) (((value >> 6) | 0x80) & 0xBF);
                    z[to++] = (char) ((value | 0x80) & 0xBF);
                } else {
                    /* error */
                    z[to++] = (char) ((value >> 18) | 0xf0);
                    z[to++] = (char) (((value >> 12) & 0x3f) | 0x80);
                    z[to++] = (char) (((value >> 6) & 0x3f) | 0x80);
                    z[to++] = (char) ((value & 0x3f) | 0x80);
                }
		    from = i+1;
		}
            } else {
                while (z[i] && isalnum((unsigned char)z[i])) {
                   i++;
                }
                c = z[i];
                z[i] = 0;
                h = ErHash(&z[from+1]);
                p = apErHash[h];
                while (p && strcmp(p->zName,&z[from+1])!=0 ) {
                    p = p->pNext;
                }
                ole = NULL;
                if (!p && c == ';') {
                    /* Entity name not found. It may be one of the few
                     * entities with a referenced UTF-8 byte sequence
                     * which is longer than the reference. */
                    if (strcmp("nGt",&z[from+1]) == 0) {
                        ole = "\xE2\x89\xAB\xE2\x83\x92\x00";
                        overlen = 1;
                    } else if (strcmp("nLt",&z[from+1]) == 0) {
                        ole = "\xE2\x89\xAA\xE2\x83\x92\x00";
                        overlen = 1;
                    }
                }
                z[i] = c;
                if (p) {
                    zVal = p->zValue;
                    while (*zVal) {
                        z[to++] = *(zVal++);
                    }
                    from = i;
                    if (c==';') from++;
                } else {
                    if (ole) {
                        /* Over-long entity reference */
                        from = i;
                        newNodeValue = MALLOC(zlen + 1 + overlen);
                        memmove(newNodeValue,z,to);
                        while (*ole) {
                            newNodeValue[to++] = *(ole++);
                        }
                        memmove(newNodeValue + to, z + from + 1 ,
                                zlen - from);
                        z = newNodeValue;
                        zlen = zlen + overlen;
                        if (textOrAtt->nodeType == ATTRIBUTE_NODE) {
                            FREE (((domAttrNode*)textOrAtt)->nodeValue);
                            ((domAttrNode*)textOrAtt)->nodeValue = z;
                            ((domAttrNode*)textOrAtt)->valueLength = zlen;
                        } else {
                            FREE (textOrAtt->nodeValue);
                            textOrAtt->nodeValue = z;
                            textOrAtt->valueLength = zlen;
                        }
                        from = to;
                    } else {
                    z[to++] = z[from++];
                        z[to++] = z[from++];
                    }
                }
            }
        } else {
            z[to++] = z[from++];
        }
    }
    z[to] = 0;
    if (textOrAtt->nodeType == ATTRIBUTE_NODE) {
        ((domAttrNode*)textOrAtt)->valueLength = to;
    } else {
    *newLen = to;
        textOrAtt->valueLength = to;
    }
}
/*----------------------------------------------------------------------------
|   End Of Character Entity Translator
\---------------------------------------------------------------------------*/



2539
2540
2541
2542
2543
2544
2545
2546

2547
2548
2549
2550
2551
2552
2553
2596
2597
2598
2599
2600
2601
2602

2603
2604
2605
2606
2607
2608
2609
2610







-
+







                tnode->nodeNumber  = NODE_NO(doc);
                tnode->valueLength = (x - start);
                tnode->nodeValue   = (char*)MALLOC((x - start)+1);
                memmove(tnode->nodeValue, start, (x - start));
                *(tnode->nodeValue + (x - start)) = 0;
                DBG(fprintf(stderr, "New text node: '%s'\n", tnode->nodeValue);)
                if (ampersandSeen) {
                    TranslateEntityRefs(tnode->nodeValue, &(tnode->valueLength) );
                    TranslateEntityRefs (tnode);
                }
                tnode->parentNode = parent_node;
                if (parent_node->firstChild)  {
                    parent_node->lastChild->nextSibling = (domNode*)tnode;
                    tnode->previousSibling = parent_node->lastChild;
                    parent_node->lastChild = (domNode*)tnode;
                } else {
3091
3092
3093
3094
3095
3096
3097
3098

3099
3100
3101
3102
3103
3104
3105
3106
3148
3149
3150
3151
3152
3153
3154

3155

3156
3157
3158
3159
3160
3161
3162







-
+
-







                attrnode->nodeName    = (char *)&(h->key);
                attrnode->nodeType    = ATTRIBUTE_NODE;
                attrnode->nodeValue   = (char*)MALLOC(nArgVal+1);
                attrnode->valueLength = nArgVal;
                memmove(attrnode->nodeValue, ArgVal, nArgVal);
                *(attrnode->nodeValue + nArgVal) = 0;
                if (ampersandSeen) {
                    TranslateEntityRefs(attrnode->nodeValue, 
                    TranslateEntityRefs ((domTextNode*)attrnode);
                                        &(attrnode->valueLength) );
                }
                if (!strcmp(ArgName, "id")) {
                    if (!doc->ids) {
                        doc->ids = 
                            (Tcl_HashTable *) MALLOC (sizeof (Tcl_HashTable));
                        Tcl_InitHashTable (doc->ids, TCL_STRING_KEYS);
                    }

Changes to tests/htmlreader.test.

73
74
75
76
77
78
79
80

81
82
83
84
85
86
87
73
74
75
76
77
78
79

80
81
82
83
84
85
86
87







-
+








# toutf8 --
#
#       Render the UTF-8 encoding of the string c as text: every byte
#       becomes a backslash, the letter x and two lowercase hex digits.
#
# Arguments:
#       c       The string to encode.
#
# Results:
#       The escaped byte sequence as a string.
proc toutf8 c {
    # Hex-dump the UTF-8 bytes; H* yields two hex digits per byte.
    binary scan [encoding convertto utf-8 $c] H* hexDigits
    # Prefix every digit pair with a literal backslash-x.
    return [regsub -all -expanded {..} $hexDigits {\x&}]
}

test html-1.8 {character entities} {
test html-1.8 {character entities} {Tcl9} {
    set result ""
    foreach {entity byteseq} {
        "AElig"                           "\\xC3\\x86"
        "AMP"                             "\\x26"
        "Aacute"                          "\\xC3\\x81"
        "Abreve"                          "\\xC4\\x82"
        "Acirc"                           "\\xC3\\x82"
2212
2213
2214
2215
2216
2217
2218















































2219
2220
2221
2222
2223
2224
2225
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272







+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+







            lappend result $entity [toutf8 [$body text]] [string tolower $byteseq]
        }
        $doc delete
    }
    set result
} ""

# An entity reference whose name is not recognized (per the test title)
# is expected to come through the HTML parser unchanged, including the
# leading ampersand and the terminating semicolon.
test html-1.9 {non-existing character entities} {
    set doc [dom parse -html {<html>&abcdef;</html>}]
    set root [$doc documentElement]
    set result [$root text]
    # Delete the document before returning the collected text.
    $doc delete
    set result
} "&abcdef;"

# Numeric character references beyond the Basic Multilingual Plane.
# The input holds the same code point twice, U+1D49E, once in decimal
# (119966) and once in hexadecimal (1d49e); both are expected to yield
# that character. Only run when the Tcl9 constraint is satisfied
# (NOTE(review): constraint is defined outside this excerpt).
test html-1.10 {non-BMP character reference} {Tcl9} {
    set doc [dom parse -html {<html>&#119966;&#x1d49e;</html>}]
    set root [$doc documentElement]
    set result [$root text]
    # Delete the document before returning the collected text.
    $doc delete
    set result
} "\U1d49e\U1d49e"

# The entity name nGt has a replacement that is longer than the
# reference itself. Here the reference lacks its terminating semicolon,
# and the expected result is the unmodified input text.
test html-1.11 {Prematur end of entity reference with overlong repacement} {Tcl9} {
    set doc [dom parse -html {<html>&nGt</html>}]
    set root [$doc documentElement]
    set result [$root text]
    # Delete the document before returning the collected text.
    $doc delete
    set result
} "&nGt"

# A properly terminated nGt reference as the entire text content. It is
# expected to expand to the two code points U+226B U+20D2, which take
# six bytes in UTF-8 while the reference text is only five bytes long.
test html-1.12 {Entity reference with overlong repacement} {Tcl9} {
    set doc [dom parse -html {<html>&nGt;</html>}]
    set root [$doc documentElement]
    set result [$root text]
    # Delete the document before returning the collected text.
    $doc delete
    set result
} "\u226B\u20D2"

# Several references with overlong replacements (nGt three times, nLt
# once) in one text node, interleaved with plain text and with the
# measuredangle reference, which is expected to yield U+2221. Each
# reference must be replaced and the surrounding text kept intact.
test html-1.13 {Entity reference with overlong repacement} {Tcl9} {
    set doc [dom parse -html {<html>&nGt;abc&nGt;&measuredangle;&nLt;&nGt;foobar</html>}]
    set root [$doc documentElement]
    set result [$root text]
    # Delete the document before returning the collected text.
    $doc delete
    set result
} "\u226B\u20D2abc\u226B\u20D2\u2221\u226A\u20D2\u226B\u20D2foobar"

# A single nGt reference followed by plain text: the text after the
# reference must survive the replacement unchanged.
test html-1.14 {Entity reference with overlong repacement} {Tcl9} {
    set doc [dom parse -html {<html>&nGt;abc</html>}]
    set root [$doc documentElement]
    set result [$root text]
    # Delete the document before returning the collected text.
    $doc delete
    set result
} "\u226B\u20D2abc"

test html-2.1 {not closed p tags} {
    set doc [dom parse -html {
        <html><body><p>Para 1<p>Para 2<p>Para 3</body></html>
    }]
    set result [$doc asXML -indent none]
    $doc delete