Many hyperlinks are disabled.
Use anonymous login to enable hyperlinks.

Overview
Comment:Added code which handles codepoints outside the BMP even for Tcl 8.6 in case of entity serialization.
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | wip
Files: files | file ages | folders
SHA3-256: 96d934bb241f1e2aace3ce0a82fc9bc653a65b22c2e8fdd32128b19e741eca09
User & Date: rolf 2024-06-26 01:32:29
Context
2024-06-26
01:48
Updated to HTML 5 entities also for serializing with -htmlEntities. Closed-Leaf check-in: cb90628bb3 user: rolf tags: HTML5Entities
01:32
Added code which handles codepoints outside the BMP even for Tcl 8.6 in case of entity serialization. Closed-Leaf check-in: 96d934bb24 user: rolf tags: wip
2024-06-24
23:39
Merged from trunk. check-in: 91f818a493 user: rolf tags: wip
Changes
Hide Diffs Unified Diffs Ignore Whitespace Patch

Changes to generic/tcldom.c.

2376
2377
2378
2379
2380
2381
2382




































2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415

    objv[0] = name;
    objv[1] = attrsList;
    objv[2] = childList;

    return Tcl_NewListObj(3, objv);
}





































/*----------------------------------------------------------------------------
|   tcldom_AppendEscaped
|
\---------------------------------------------------------------------------*/
static
void tcldom_AppendEscaped (
    Tcl_Obj    *xmlString,
    Tcl_Channel chan,
    char       *value,
    domLength   value_length,
    int         outputFlags
)
{
#define APESC_BUF_SIZE 512
#define AP(c)  *b++ = c;
#define AE(s)  pc1 = s; while(*pc1) *b++ = *pc1++;
#define TWOCPE clen2 = UTF8_CHAR_LEN(*(pc+clen)); \
    if (clen) Tcl_UtfToUniChar(pc+clen, &uniChar2);
#define MCP    pc += clen; clen = clen2;
    char  buf[APESC_BUF_SIZE+80], *b, *bLimit,  *pc, *pc1, *pEnd,
          charRef[10];
    int   charDone, i;
    int   clen = 0, clen2 = 0;
    int   unicode;
    Tcl_UniChar uniChar, uniChar2;
    
    b = buf;
    bLimit = b + APESC_BUF_SIZE;
    pc = pEnd = value;
    if (value_length != -1) {
        pEnd = pc + value_length;
    }







>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>


















|






|







2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451

    objv[0] = name;
    objv[1] = attrsList;
    objv[2] = childList;

    return Tcl_NewListObj(3, objv);
}

#if TCL_MAJOR_VERSION < 9
/*----------------------------------------------------------------------------
|   tcldom_UtfToUniChar
|
|   Decodes the UTF-8 character starting at src, stores its code point
|   in *uniChar and returns the number of bytes consumed.
|
|   If UTF8_CHAR_LEN() reports a length of 1 to 3 for the lead byte the
|   work is delegated to Tcl_UtfToUniChar(). A 4-byte sequence is
|   decoded here, so that the complete code point (0x10000 - 0x10FFFF)
|   is delivered in the int result, independent of the width of
|   Tcl_UniChar. For every other input (length reported as 0, missing
|   trail bytes, decoded value outside of 0x10000 - 0x10FFFF) the first
|   byte is returned as code point, with a length of 1.
|
|   With Tcl 9 and later the name is just an alias for
|   Tcl_UtfToUniChar().
|
\---------------------------------------------------------------------------*/
static
int tcldom_UtfToUniChar (
    const char *src,
    int *uniChar
    )
{
    int clen, codePoint;
    Tcl_UniChar uni16;

    clen = UTF8_CHAR_LEN(*src);
    if (clen && clen < 4) {
        clen = Tcl_UtfToUniChar (src, &uni16);
        *uniChar = uni16;
        return clen;
    }
    /* The trail bytes are checked left to right and the check stops at
     * the first mismatch, so no byte behind a terminating \0 is read. */
    if (clen == 4
        && ((src[1] & 0xC0) == 0x80)
        && ((src[2] & 0xC0) == 0x80)
        && ((src[3] & 0xC0) == 0x80)) {
        /*
         * Four-byte-character lead byte followed by three trail bytes.
         */
        codePoint = (((src[0] & 0x07) << 18) | ((src[1] & 0x3F) << 12)
                     | ((src[2] & 0x3F) << 6) | (src[3] & 0x3F));
        if ((unsigned)(codePoint - 0x10000) <= 0xFFFFF) {
            *uniChar = codePoint;
            return 4;
        }
    }
    /* Fallback: hand back the single byte. Go through unsigned char,
     * otherwise a byte >= 0x80 becomes a negative code point on
     * platforms where plain char is signed. */
    *uniChar = (unsigned char)src[0];
    return 1;
}
#else
# define tcldom_UtfToUniChar Tcl_UtfToUniChar
#endif

/*----------------------------------------------------------------------------
|   tcldom_AppendEscaped
|
\---------------------------------------------------------------------------*/
static
void tcldom_AppendEscaped (
    Tcl_Obj    *xmlString,
    Tcl_Channel chan,
    char       *value,
    domLength   value_length,
    int         outputFlags
)
{
#define APESC_BUF_SIZE 512
#define AP(c)  *b++ = c;
#define AE(s)  pc1 = s; while(*pc1) *b++ = *pc1++;
#define TWOCPE clen2 = UTF8_CHAR_LEN(*(pc+clen)); \
    if (clen) tcldom_UtfToUniChar(pc+clen, &uniChar2);
#define MCP    pc += clen; clen = clen2;
    char  buf[APESC_BUF_SIZE+80], *b, *bLimit,  *pc, *pc1, *pEnd,
          charRef[10];
    int   charDone, i;
    int   clen = 0, clen2 = 0;
    int   unicode;
    int   uniChar, uniChar2;
    
    b = buf;
    bLimit = b + APESC_BUF_SIZE;
    pc = pEnd = value;
    if (value_length != -1) {
        pEnd = pc + value_length;
    }
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
            AP('&') AP('#') AP('x') AP('9') AP(';')
        } else 
        {
            charDone = 0;
            clen = UTF8_CHAR_LEN(*pc);
            if (outputFlags & SERIALIZE_HTML_ENTITIES) {
                charDone = 1;
                Tcl_UtfToUniChar(pc, &uniChar);
                switch (uniChar) {
                    #include "HTML5ent.inc"
                default: charDone = 0; 
                }
                if (charDone) {
                    pc += (clen - 1);
                }







|







2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
            AP('&') AP('#') AP('x') AP('9') AP(';')
        } else 
        {
            charDone = 0;
            clen = UTF8_CHAR_LEN(*pc);
            if (outputFlags & SERIALIZE_HTML_ENTITIES) {
                charDone = 1;
                tcldom_UtfToUniChar(pc, &uniChar);
                switch (uniChar) {
                    #include "HTML5ent.inc"
                default: charDone = 0; 
                }
                if (charDone) {
                    pc += (clen - 1);
                }

Changes to tests/domDoc.test.

93
94
95
96
97
98
99







100
101
102
103
104
105
106

# asHTML -htmlEntities: the three entity references and the literal
# character pair U+226A U+20D2 in the parsed input are all expected to
# be serialized as named entities. The literal pair has to come out as
# &nLt;, the same as the &nLt; reference in front of it.
test domDoc-1.3.4 {asHTML -htmlEntities} {
    set doc [dom parse -html "<html>&NotLessLess;&ll;&nLt;\u226A\u20D2</html>"]
    set result [$doc asHTML -htmlEntities]
    # Release the document before the serialization is handed back.
    $doc delete
    # The serialization is the result of the test body.
    set result
} {<html>&nLtv;&ll;&nLt;&nLt;</html>}








set doc [dom parse <root/>]

test domDoc-1.4 {asXML -doctypeDeclaration} {
    $doc asXML -doctypeDeclaration 1
} {<!DOCTYPE root>
<root/>







>
>
>
>
>
>
>







93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113

# asHTML -htmlEntities: the three entity references and the literal
# character pair U+226A U+20D2 in the parsed input are all expected to
# be serialized as named entities. The literal pair has to come out as
# &nLt;, the same as the &nLt; reference in front of it.
test domDoc-1.3.4 {asHTML -htmlEntities} {
    set doc [dom parse -html "<html>&NotLessLess;&ll;&nLt;\u226A\u20D2</html>"]
    set result [$doc asHTML -htmlEntities]
    # Release the document before the serialization is handed back.
    $doc delete
    # The serialization is the result of the test body.
    set result
} {<html>&nLtv;&ll;&nLt;&nLt;</html>}

# asHTML -htmlEntities round trip of a single entity reference: the
# parsed &zopf; must be serialized as &zopf; again.
# Presumably this covers a code point outside of the BMP (the case
# named in the check-in comment) - TODO confirm the code point of zopf.
# NOTE(review): the id domDoc-1.3.3 sorts before the preceding
# domDoc-1.3.4 - verify that it does not duplicate an existing test
# name earlier in this file.
test domDoc-1.3.3 {asHTML -htmlEntities} {
    set doc [dom parse -html "<html>&zopf;</html>"]
    set result [$doc asHTML -htmlEntities]
    # Release the document before the serialization is handed back.
    $doc delete
    # The serialization is the result of the test body.
    set result
} {<html>&zopf;</html>}

set doc [dom parse <root/>]

test domDoc-1.4 {asXML -doctypeDeclaration} {
    $doc asXML -doctypeDeclaration 1
} {<!DOCTYPE root>
<root/>