Many hyperlinks are disabled.
Use anonymous login to enable hyperlinks.

Overview
Comment:Added the isHTML5CustomName method to the dom command. The simple HTML reader now accept HTML5 custom element names.
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | trunk
Files: files | file ages | folders
SHA3-256: 9d8f28621bbf19a174ca8e3bb6c02ffa80dda5ed967dd2deac665ebd1ec2ae58
User & Date: rolf 2024-07-02 22:08:25
Context
2024-07-02
22:09
Updated the CHANGES file. check-in: d02dcd66ea user: rolf tags: trunk
22:08
Added the isHTML5CustomName method to the dom command. The simple HTML reader now accept HTML5 custom element names. check-in: 9d8f28621b user: rolf tags: trunk
21:58
Added the isHTML5CustomName method to the dom command. The simple HTML reader Uses the machinery to accept HTML5 custom element names. Closed-Leaf check-in: a2c9f83840 user: rolf tags: HTML5CustomNames
2024-07-01
11:42
Use the enhanced encoding capabilities of Tcl 9 better. And more changes as result of TIP 699. check-in: 0058351450 user: rolf tags: trunk
Changes
Hide Diffs Unified Diffs Ignore Whitespace Patch

Changes to generic/dom.c.

351
352
353
354
355
356
357









































































358
359
360
361
362
363
364
        if (!clen) return 0;
        if (UTF8_XMLCHAR((unsigned const char *)p,clen))
            p += clen;
        else return 0;
    }
    return 1;
}










































































/*---------------------------------------------------------------------------
|   domClearString
|
\--------------------------------------------------------------------------*/
void 
domClearString (







>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>







351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
        if (!clen) return 0;
        if (UTF8_XMLCHAR((unsigned const char *)p,clen))
            p += clen;
        else return 0;
    }
    return 1;
}

/*---------------------------------------------------------------------------
|   domIsHTML5CustomName 
|
\--------------------------------------------------------------------------*/
int
domIsHTML5CustomName (
    const char *str
    )
{
    const char *p;
    int clen, dashseen = 0;
    Tcl_UniChar uniChar;

    p = str;
    if (*p < 'a' || *p > 'z') {
        return 0;
    }
    p++;
    while (*p) {
        clen = UTF8_CHAR_LEN(*p);
        if (clen == 0) return 0;
        if (clen == 1) {
            if (*p == '-') {
                dashseen = 1;
                p++;
            } else {
                if (*p == '.'
                    || (*p >= '0' && *p <= '9')
                    || *p == '_'
                    || (*p >= 'a' && *p <= 'z')) {
                    p++;
                } else {
                    return 0;
                }
            }
            continue;
        } 
        if (clen == 4) {
            p += clen;
            continue;
        }
        clen = Tcl_UtfToUniChar (p, &uniChar);
        if (uniChar == 0xB7
            || (uniChar >= 0xC0 && uniChar <= 0xD6)
            || (uniChar >= 0xD8 && uniChar <= 0xF6) 
            || (uniChar >= 0xF8 && uniChar <= 0x37D) 
            || (uniChar >= 0x37F && uniChar <= 0x1FFF) 
            || (uniChar >= 0x200C && uniChar <= 0x200D) 
            || (uniChar >= 0x203F && uniChar <= 0x2040)
            || (uniChar >= 0x2070 && uniChar <= 0x218F)
            || (uniChar >= 0x2C00 && uniChar <= 0x2FEF)
            || (uniChar >= 0x3001 && uniChar <= 0xD7FF)
            || (uniChar >= 0xF900 && uniChar <= 0xFDCF)
            || (uniChar >= 0xFDF0 && uniChar <= 0xFFFD)) {
            p += clen;
        } else {
            return 0;
        }
    }
    if (!dashseen) return 0;
    switch (str[0]) {
    case 'a': if (!strcmp(str,"annotation-xml")) {return 0;} break;
    case 'c': if (!strcmp(str,"color-profile")) {return 0;} break;
    case 'f': if (!strcmp(str,"font-face")        ||
                  !strcmp(str,"font-face-src")    ||
                  !strcmp(str,"font-face-uri")    ||
                  !strcmp(str,"font-face-format") ||
                  !strcmp(str,"font-face-name")) {return 0;} break;
    case 'm': if (!strcmp(str,"missing-glyph")) {return 0;} break;
    }
    return 1;
}

/*---------------------------------------------------------------------------
|   domClearString
|
\--------------------------------------------------------------------------*/
void 
domClearString (

Changes to generic/dom.h.

912
913
914
915
916
917
918

919
920
921
922
923
924
925
int            domIsChar (const char *str);
void           domClearString (char *str, char *replacement, domLength repllen,
                               Tcl_DString *clearedstr, int *changed);
int            domIsBMPChar (const char *str);
int            domIsComment (const char *str);
int            domIsCDATA (const char *str);
int            domIsPIValue (const char *str);

void           domCopyTo (domNode *node, domNode *parent, int copyNS);
void           domCopyNS (domNode *from, domNode *to);
domAttrNode *  domCreateXMLNamespaceNode (domNode *parent);
void           domRenumberTree (domNode *node);
int            domPrecedes (domNode *node, domNode *other);
void           domNormalize (domNode *node, int forXPath, 
                             domFreeCallback freeCB, void *clientData);







>







912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
int            domIsChar (const char *str);
void           domClearString (char *str, char *replacement, domLength repllen,
                               Tcl_DString *clearedstr, int *changed);
int            domIsBMPChar (const char *str);
int            domIsComment (const char *str);
int            domIsCDATA (const char *str);
int            domIsPIValue (const char *str);
int            domIsHTML5CustomName (const char *str);
void           domCopyTo (domNode *node, domNode *parent, int copyNS);
void           domCopyNS (domNode *from, domNode *to);
domAttrNode *  domCreateXMLNamespaceNode (domNode *parent);
void           domRenumberTree (domNode *node);
int            domPrecedes (domNode *node, domNode *other);
void           domNormalize (domNode *node, int forXPath, 
                             domFreeCallback freeCB, void *clientData);

Changes to generic/domhtml.c.

2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
    int            hasContent;
    domNode       *pnode;
    domNode       *node = NULL, *parent_node = parent;
    domTextNode   *tnode;
    domAttrNode   *attrnode, *lastAttr;
    int            ampersandSeen = 0;
    int            only_whites   = 0;
    int            hnew, autoclose, ignore;
    char           tmp[250], *y = NULL;
    Tcl_HashEntry *h;
    domProcessingInstructionNode *pinode;

    x = &(html[*pos]);

    while ( (c=*x)!=0 ) {







|







2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
    int            hasContent;
    domNode       *pnode;
    domNode       *node = NULL, *parent_node = parent;
    domTextNode   *tnode;
    domAttrNode   *attrnode, *lastAttr;
    int            ampersandSeen = 0;
    int            only_whites   = 0;
    int            hnew, autoclose, ignore, maybeCustomName, rc;
    char           tmp[250], *y = NULL;
    Tcl_HashEntry *h;
    domProcessingInstructionNode *pinode;

    x = &(html[*pos]);

    while ( (c=*x)!=0 ) {
2974
2975
2976
2977
2978
2979
2980

2981
2982







2983
2984










2985
2986
2987
2988
2989
2990
2991
            }


            /*----------------------------------------------------------------
            |   new tag/element
            |
            \---------------------------------------------------------------*/

            while ((c=*x)!=0 && c!='/' && c!='>' && c!='<' && !SPACE(c) ) {
                if (!isalnum(c)) goto readText;







                *x = tolower(c);
                x++;










            }
            hasContent = 1;
            if (c==0) {
                RetError("Missing \">\"",(start-html) );
            }
            if ( (x-start)==1) {
                RetError("Null markup name",(start-html) );







>

|
>
>
>
>
>
>
>
|
|
>
>
>
>
>
>
>
>
>
>







2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
            }


            /*----------------------------------------------------------------
            |   new tag/element
            |
            \---------------------------------------------------------------*/
            maybeCustomName = 0;
            while ((c=*x)!=0 && c!='/' && c!='>' && c!='<' && !SPACE(c) ) {
                if (!maybeCustomName && !isalnum(c)) maybeCustomName = 1;
                x++;
            }
            if (!maybeCustomName) {
                c = *x;
                *x = '\0'; /* temporarily terminate the string */
                x = start + 1;
                while (*x) {
                    *x = tolower(*x);
                    x++;
                }
                *x = c;
            } else {
                c = *x;
                *x = '\0'; /* temporarily terminate the string */
                rc = domIsHTML5CustomName (start+1);
                *x = c;
                if (!rc) {
                    goto readText;
                }
            }
            hasContent = 1;
            if (c==0) {
                RetError("Missing \">\"",(start-html) );
            }
            if ( (x-start)==1) {
                RetError("Null markup name",(start-html) );

Changes to generic/tcldom.c.

224
225
226
227
228
229
230

231
232
233
234
235
236
237
    "    isComment string                                 \n"
    "    isCDATA string                                   \n"
    "    isPIValue string                                 \n"
    "    isName string                                    \n"
    "    isQName string                                   \n"
    "    isNCName string                                  \n"
    "    isPIName string                                  \n"

    "    featureinfo feature                              \n"
;

static char doc_usage[] =
    "Usage domDoc <method> <args>, where method can be:\n"
    "    documentElement ?objVar?                \n"
    "    getElementsByTagName name               \n"







>







224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
    "    isComment string                                 \n"
    "    isCDATA string                                   \n"
    "    isPIValue string                                 \n"
    "    isName string                                    \n"
    "    isQName string                                   \n"
    "    isNCName string                                  \n"
    "    isPIName string                                  \n"
    "    isHTML5CustomName string                         \n"
    "    featureinfo feature                              \n"
;

static char doc_usage[] =
    "Usage domDoc <method> <args>, where method can be:\n"
    "    documentElement ?objVar?                \n"
    "    getElementsByTagName name               \n"
7630
7631
7632
7633
7634
7635
7636

7637
7638
7639
7640
7641
7642
7643
7644
7645
7646
7647
7648
7649

7650
7651
7652
7653
7654
7655
7656
        "createDocument",  "createDocumentNS",   "createNodeCmd",
        "parse",                                 "setStoreLineColumn",
        "isCharData",      "isName",             "isPIName",
        "isQName",         "isComment",          "isCDATA",
        "isPIValue",       "isNCName",           "createDocumentNode",
        "setNameCheck",    "setTextCheck",       "setObjectCommands",
        "featureinfo",     "isBMPCharData",      "clearString",

#ifdef TCL_THREADS
        "attachDocument",  "detachDocument",
#endif
        NULL
    };
    enum domMethod {
        m_createDocument,    m_createDocumentNS,   m_createNodeCmd,
        m_parse,                                   m_setStoreLineColumn,
        m_isCharData,        m_isName,             m_isPIName,
        m_isQName,           m_isComment,          m_isCDATA,
        m_isPIValue,         m_isNCName,           m_createDocumentNode,
        m_setNameCheck,      m_setTextCheck,       m_setObjectCommands,
        m_featureinfo,       m_isBMPCharData,      m_clearString

#ifdef TCL_THREADS
        ,m_attachDocument,   m_detachDocument
#endif
    };

    static const char *nodeModeValues[] = {
        "automatic", "command", "token", NULL







>












|
>







7631
7632
7633
7634
7635
7636
7637
7638
7639
7640
7641
7642
7643
7644
7645
7646
7647
7648
7649
7650
7651
7652
7653
7654
7655
7656
7657
7658
7659
        "createDocument",  "createDocumentNS",   "createNodeCmd",
        "parse",                                 "setStoreLineColumn",
        "isCharData",      "isName",             "isPIName",
        "isQName",         "isComment",          "isCDATA",
        "isPIValue",       "isNCName",           "createDocumentNode",
        "setNameCheck",    "setTextCheck",       "setObjectCommands",
        "featureinfo",     "isBMPCharData",      "clearString",
        "isHTML5CustomName",
#ifdef TCL_THREADS
        "attachDocument",  "detachDocument",
#endif
        NULL
    };
    enum domMethod {
        m_createDocument,    m_createDocumentNS,   m_createNodeCmd,
        m_parse,                                   m_setStoreLineColumn,
        m_isCharData,        m_isName,             m_isPIName,
        m_isQName,           m_isComment,          m_isCDATA,
        m_isPIValue,         m_isNCName,           m_createDocumentNode,
        m_setNameCheck,      m_setTextCheck,       m_setObjectCommands,
        m_featureinfo,       m_isBMPCharData,      m_clearString,
        m_isHTML5CustomName
#ifdef TCL_THREADS
        ,m_attachDocument,   m_detachDocument
#endif
    };

    static const char *nodeModeValues[] = {
        "automatic", "command", "token", NULL
7880
7881
7882
7883
7884
7885
7886





7887
7888
7889
7890
7891
7892
7893
            return tcldom_featureinfo(clientData, interp, --objc, objv+1);

        case m_isBMPCharData:
            CheckArgs(3,3,2,"string");
            SetBooleanResult(domIsBMPChar(Tcl_GetString(objv[2])));
            return TCL_OK;






        case m_clearString:
            CheckArgs(3,5,2,"?-replace ?replacement?? string");
            if (objc >= 4) {
                option = Tcl_GetString (objv[2]);
                if (option[0] == '-' && option[1] == 'r') {
                    if (Tcl_GetIndexFromObj (interp, objv[2],
                                             clearStringOptions, "option",







>
>
>
>
>







7883
7884
7885
7886
7887
7888
7889
7890
7891
7892
7893
7894
7895
7896
7897
7898
7899
7900
7901
            return tcldom_featureinfo(clientData, interp, --objc, objv+1);

        case m_isBMPCharData:
            CheckArgs(3,3,2,"string");
            SetBooleanResult(domIsBMPChar(Tcl_GetString(objv[2])));
            return TCL_OK;

        case m_isHTML5CustomName:
            CheckArgs(3,3,2,"string");
            SetBooleanResult(domIsHTML5CustomName(Tcl_GetString(objv[2])));
            return TCL_OK;
        
        case m_clearString:
            CheckArgs(3,5,2,"?-replace ?replacement?? string");
            if (objc >= 4) {
                option = Tcl_GetString (objv[2]);
                if (option[0] == '-' && option[1] == 'r') {
                    if (Tcl_GetIndexFromObj (interp, objv[2],
                                             clearStringOptions, "option",

Changes to tests/dom.test.

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
# Features covered: dom command
#
# This file contains a collection of tests for the dom command of
# tDOM.
#
#    dom-1.*:  createDocument, createDocumentNS
#    dom-2.*:  parse
#    dom-3.*:  isName, isNCName, isCharData, isPIName, isComment, isCDATA
#    dom-4.*:  parse -useForeignDTD
#    dom-5.*:  external entities
#    dom-6.*:  use in slave interpreter
#    dom-7.*:  setNameCheck, setTextCheck
#    dom-8.*:  createDocumentNode, documentNodes
#    dom-9.*:  setObjectCommands
#    dom-10.*: createNodeCmd







|







1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
# Features covered: dom command
#
# This file contains a collection of tests for the dom command of
# tDOM.
#
#    dom-1.*:  createDocument, createDocumentNS
#    dom-2.*:  parse
#    dom-3.*:  is* methods, clearString
#    dom-4.*:  parse -useForeignDTD
#    dom-5.*:  external entities
#    dom-6.*:  use in slave interpreter
#    dom-7.*:  setNameCheck, setTextCheck
#    dom-8.*:  createDocumentNode, documentNodes
#    dom-9.*:  setObjectCommands
#    dom-10.*: createNodeCmd
1183
1184
1185
1186
1187
1188
1189


























1190
1191
1192
1193
1194
1195
1196
        \u0004\u0005\u0001
    } {
        lappend result [dom clearString -replace some $str]
    }
    set result
} [list some asome someb asomeb asomesomeb asomecsomeb asomedsomesomeb asomedsomesomesomesome_foo_bar somesomesome_foo_bar_bazsome_didumsome somesomesome_foo_bar_bazsome_didum\uE000 somesomesome abc somesomesome]



























test dom-4.1 {-useForeignDTD 0} {
    set doc [dom parse -useForeignDTD 0 {<root/>}]
    $doc delete
} {}

test dom-4.2 {-useForeignDTD 1 with document with internal subset} {need_uri} {
    set baseURI [tdom::baseURL [file join [pwd] [file dir [info script]] dom.test]]







>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>







1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
        \u0004\u0005\u0001
    } {
        lappend result [dom clearString -replace some $str]
    }
    set result
} [list some asome someb asomeb asomesomeb asomecsomeb asomedsomesomeb asomedsomesomesomesome_foo_bar somesomesome_foo_bar_bazsome_didumsome somesomesome_foo_bar_bazsome_didum\uE000 somesomesome abc somesomesome]

test dom-3.46 {isHTML5CustomName} {
    set result [list]
    foreach str {
        abc
        ab-c
        ab-C
        -ab
        äb-c
        ab---c
        ab-
        ab-ᴨ
        aÄ-b
        aA-b
        aä-b
        0n-foo
        A-b
        font-face
        m\u00B7-.
        n-\uFDF0
        o-\u3000
    } {
        lappend result [dom isHTML5CustomName $str]
    }
    set result
} {0 1 0 0 0 1 1 1 1 0 1 0 0 0 1 1 0}

test dom-4.1 {-useForeignDTD 0} {
    set doc [dom parse -useForeignDTD 0 {<root/>}]
    $doc delete
} {}

test dom-4.2 {-useForeignDTD 1 with document with internal subset} {need_uri} {
    set baseURI [tdom::baseURL [file join [pwd] [file dir [info script]] dom.test]]

Changes to tests/htmlreader.test.

2395
2396
2397
2398
2399
2400
2401






















2402
2403
2404
2405
2406
2407
2408
<head><title></title></head><body><form><select id="L" name="nls_language">
<option value="">--
        </option><option selected="selected" value="en_US">en_US
        </option><option value="de_DE">de_DE
        </option>
</select></form></body>
</html>}























test html-3.1 {Bad data} {
    set data {line 6 column 17 - Warning: <script> lacks "type" attribute
line 10 column 17 - Warning: <script> lacks "type" attribute
        line 11 column 17 - Warning: <table> lacks "summary" attribute}
    set doc [dom parse -html $data]
    set result [$doc asHTML]







>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>







2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
<head><title></title></head><body><form><select id="L" name="nls_language">
<option value="">--
        </option><option selected="selected" value="en_US">en_US
        </option><option value="de_DE">de_DE
        </option>
</select></form></body>
</html>}

test html-2.9 {HTML parsing} {knownBug} {
    set doc [dom parse -html {<HTML><MATH-field>x = 1 + 1</MATH-field></HTML>}]
    set result [$doc asHTML]
    $doc delete
    set result
} {<html>&lt;MATH-field&gt;x = 1 + 1&lt;/MATH-field&gt;</html&gt;}

test html-2.10 {HTML parsing} {
    set doc [dom parse -html {<HTML><mytag>foo</mytag></HTML>}]
    set result [$doc asHTML]
    $doc delete
    set result
} {<html><mytag>foo</mytag></html>}

test html-2.11 {HTML parsing} {
    set doc [dom parse -html {<HTML><math-field>x = 1 + 1</math-field></HTML>}]
    set result [$doc asHTML]
    $doc delete
    set result
} {<html><math-field>x = 1 + 1</math-field></html>}


test html-3.1 {Bad data} {
    set data {line 6 column 17 - Warning: <script> lacks "type" attribute
line 10 column 17 - Warning: <script> lacks "type" attribute
        line 11 column 17 - Warning: <table> lacks "summary" attribute}
    set doc [dom parse -html $data]
    set result [$doc asHTML]