Many hyperlinks are disabled.
Use anonymous login to enable hyperlinks.

Overview
Comment:Implemented user defined replacement.
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | clearstring
Files: files | file ages | folders
SHA3-256: 3d780d75630d8c8d72bfa43c73c793e8b0c0f69bb4aa1836190d315fd5d68b75
User & Date: rolf 2024-05-20 17:17:42
Context
2024-05-20
23:20
Minor code improvements and more tests. Closed-Leaf check-in: 522169e2ea user: rolf tags: clearstring
17:17
Implemented user defined replacement. check-in: 3d780d7563 user: rolf tags: clearstring
2024-05-17
00:18
Updated to expat 2.6.2. check-in: e37386009b user: rolf tags: trunk
Changes
Hide Diffs Unified Diffs Ignore Whitespace Patch

Changes to generic/dom.c.

344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363

364
365
366
367
368


369
370
371
372
373
374
375
376
377
378
379


380
381
382
383
384
385
386











387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407

408

409
410
411
412
413
414
415
416

417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
{
    const char *p;
    int   clen;
    
    p = str;
    while (*p) {
        clen = UTF8_CHAR_LEN(*p);
        if (clen > 4) return 0;
        if (UTF8_XMLCHAR((unsigned const char *)p,clen))
            p += clen;
        else return 0;
    }
    return 1;
}

/*---------------------------------------------------------------------------
|   domClearString
|
|   Returns str with every code point that is not an XML char removed
|   or, if replace is true, substituted by U+FFFD (written as the
|   bytes EF BF BD).
|
|   str         NUL terminated input string.
|   haveToFree  Set to 0 if str itself is returned (nothing had to be
|               changed), set to 1 if the result was allocated with
|               MALLOC and has to be released by the caller.
|   replace     If not 0 the replacement character is inserted for
|               every offending code point, otherwise they are just
|               dropped.
|   length      Byte length of str without the closing \0. Only used
|               to size the result buffer.
|
\--------------------------------------------------------------------------*/
char *
domClearString (
    char *str,
    int *haveToFree,
    int replace,
    domLength length
    )
{
    const char *s;
    char *p, *clearedstr;
    int   clen, i, rewrite = 0;

    /* Look for the first code point which has to go. */
    p = str;
    while (*p) {
        clen = UTF8_CHAR_LEN(*p);
        /* A byte that cannot start a character may be reported with
         * a length of 0; handle that as invalid without looking at
         * the char class. */
        if (clen < 1 || clen > 4
            || !UTF8_XMLCHAR((unsigned const char*)p,clen)) {
            rewrite = 1;
            break;
        }
        p += clen;
    }
    if (!rewrite) {
        /* Nothing to do, hand back the input as it is. */
        *haveToFree = 0;
        return str;
    }
    s = p;
    if (replace) {
        /* Worst case: every char from the first illegal one up to
         * the end is a single byte one and will be replaced with a
         * three byte one. So we need (in addition to the one byte
         * that is already included in length) two additional bytes
         * for every outstanding char, plus the closing \0. */
        clearedstr = MALLOC (sizeof(char) * (length+(length-(s-str))*2)+1);
    } else {
        /* We have at least one code point to skip, so length alone
         * will be enough for the result string including the
         * closing \0. */
        clearedstr = MALLOC (sizeof(char) * length);
    }
    /* Take over the clean prefix unchanged. */
    memcpy (clearedstr, str, (s-str));
    p = clearedstr + (s-str);
    str += (s-str);
    if (replace) {
        *p = '\xEF'; p++;
        *p = '\xBF'; p++;
        *p = '\xBD'; p++;
    }
    /* Always step over at least one byte; with a reported length of
     * 0 the scan below would otherwise never advance. */
    str += (clen < 1) ? 1 : clen;
    while (*str) {
        clen = UTF8_CHAR_LEN(*str);
        if (clen >= 1 && clen <= 4
            && UTF8_XMLCHAR((unsigned const char*)str,clen)) {
            for (i = 0; i < clen; i++) {
                *p = *str;
                p++; str++;
            }
        } else {
            if (replace) {
                *p = '\xEF'; p++;
                *p = '\xBF'; p++;
                *p = '\xBD'; p++;
            }
            str += (clen < 1) ? 1 : clen;
        }
    }
    *p = '\0';
    *haveToFree = 1;
    return clearedstr;
}

/*---------------------------------------------------------------------------
|   domIsBMPChar 
|
\--------------------------------------------------------------------------*/
int







|











<
>


<
|
|
>
>


<
|
|




|

>
>





<
|
>
>
>
>
>
>
>
>
>
>
>


|
|
<
<
<
|
<
<
<
<
<
<
|
<
<
|
<
<
<
>
|
>
|
|
<
<
<
<
|

>

<
<
<
<
<
|


<
<
|







344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362

363
364
365

366
367
368
369
370
371

372
373
374
375
376
377
378
379
380
381
382
383
384
385
386

387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402



403






404


405



406
407
408
409
410




411
412
413
414





415
416
417


418
419
420
421
422
423
424
425
{
    const char *p;
    int   clen;
    
    p = str;
    while (*p) {
        clen = UTF8_CHAR_LEN(*p);
        if (!clen) return 0;
        if (UTF8_XMLCHAR((unsigned const char *)p,clen))
            p += clen;
        else return 0;
    }
    return 1;
}

/*---------------------------------------------------------------------------
|   domClearString
|
|   Scans the NUL terminated str for code points which are not XML
|   chars. If there are none, neither clearedstr nor *changed is
|   touched. Otherwise *changed is set to 1, clearedstr is
|   initialized and filled with a copy of str in which every such
|   code point is left out or - if repllen isn't 0 - substituted by
|   repllen bytes of replacement. Releasing clearedstr is then left
|   to the caller.
|
\--------------------------------------------------------------------------*/

void 
domClearString (
    char *str,
    char *replacement,
    domLength repllen,
    Tcl_DString *clearedstr,
    int *changed
    )
{
    char *cur, *runStart;
    int   clen, dirty = 0;

    /* runStart marks the begin of the not yet copied run of valid
     * chars, cur is the scan position. */
    runStart = str;
    cur = str;
    while (*cur) {
        clen = UTF8_CHAR_LEN(*cur);
        if (clen && (UTF8_XMLCHAR((unsigned const char*)cur,clen))) {
            cur += clen;
            continue;
        }
        if (!dirty) {
            /* First offending code point: from now on the result
             * string is needed. */
            dirty = 1;
            *changed = 1;
            Tcl_DStringInit (clearedstr);
        }
        /* Flush the valid run collected so far. */
        Tcl_DStringAppend (clearedstr, runStart, cur-runStart);
        if (repllen) {
            Tcl_DStringAppend (clearedstr, replacement, repllen);
        }
        /* If it isn't an UTF-8 encoded character (clen is 0) we
         * don't know how long it is; step over a single byte. */
        cur += clen ? clen : 1;
        runStart = cur;
    }
    if (dirty) {
        /* Copy what is left after the last offending code point. */
        Tcl_DStringAppend (clearedstr, runStart, cur-runStart);
    }
}

/*---------------------------------------------------------------------------
|   domIsBMPChar 
|
\--------------------------------------------------------------------------*/
int

Changes to generic/dom.h.

906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921

void           tcldom_tolower (const char *str, char *str_out, int  len);
int            domIsNAME (const char *name);
int            domIsPINAME (const char *name);
int            domIsQNAME (const char *name);
int            domIsNCNAME (const char *name);
int            domIsChar (const char *str);
/* Returns str without its non XML chars (replaced by U+FFFD if replace
 * is true). *haveToFree tells whether the result was newly allocated
 * and must be released by the caller. */
char *         domClearString (char *str, int *haveToFree, int replace,
                               domLength length);
int            domIsBMPChar (const char *str);
int            domIsComment (const char *str);
int            domIsCDATA (const char *str);
int            domIsPIValue (const char *str);
void           domCopyTo (domNode *node, domNode *parent, int copyNS);
void           domCopyNS (domNode *from, domNode *to);
domAttrNode *  domCreateXMLNamespaceNode (domNode *parent);







|
|







906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921

void           tcldom_tolower (const char *str, char *str_out, int  len);
int            domIsNAME (const char *name);
int            domIsPINAME (const char *name);
int            domIsQNAME (const char *name);
int            domIsNCNAME (const char *name);
int            domIsChar (const char *str);
/* If str has non XML chars: sets *changed to 1 and stores into
 * clearedstr a copy with them removed (or replaced by replacement, if
 * repllen isn't 0). Otherwise clearedstr and *changed are not touched. */
void           domClearString (char *str, char *replacement, domLength repllen,
                               Tcl_DString *clearedstr, int *changed);
int            domIsBMPChar (const char *str);
int            domIsComment (const char *str);
int            domIsCDATA (const char *str);
int            domIsPIValue (const char *str);
void           domCopyTo (domNode *node, domNode *parent, int copyNS);
void           domCopyNS (domNode *from, domNode *to);
domAttrNode *  domCreateXMLNamespaceNode (domNode *parent);

Changes to generic/domxslt.c.

6156
6157
6158
6159
6160
6161
6162
6163
6164
6165
6166
6167
6168
6169
6170
6171
6172
6173
6174
6175
6176
6177
6178
6179
6180
6181
6182
6183
6184
6185
6186
6187
6188
6189
6190
6191
6192
6193
6194
6195
6196
6197
6198
6199
6200
6201
6202
6203
6204
6205
6206
6207
6208
6209
6210
6211
6212
6213
6214
6215
6216
6217
6218
6219
6220
6221
6222
6223
6224
6225
6226
6227
6228
6229
6230
6231
6232
6233
6234
6235
6236
6237
6238
6239
6240
6241
6242
6243
6244
6245
6246
6247
6248
6249
6250
6251
6252
6253
6254
6255
6256
6257
                } else {
                    /* definitions for default decimal format */
                    df = xs->decimalFormats;
                }
                str = getAttr(node, "decimal-separator",  a_decimalSeparator);
                if (str) {
                    clen = UTF8_CHAR_LEN (str[0]);
                    if (str[clen] != '\0') {
                        reportError (node, "decimal-separator has to be a"
                                     " single char", errMsg);
                        if (newdf) FREE((char*)df);
                        return -1;
                    }
                    Tcl_UtfToUniChar (str, &df->decimalSeparator);
                }
                str = getAttr(node, "grouping-separator", a_groupingSeparator);
                if (str) {
                    clen = UTF8_CHAR_LEN (str[0]);
                    if (str[clen] != '\0') {
                        reportError (node, "groupingSeparator has to be a"
                                     " single char", errMsg);
                        if (newdf) FREE((char*)df);
                        return -1;
                    }
                    Tcl_UtfToUniChar (str, &df->groupingSeparator);
                }
                str = getAttr(node, "infinity",           a_infinity);
                if (str) df->infinity = str;
                str = getAttr(node, "minus-sign",         a_minusSign);
                if (str) {
                    clen = UTF8_CHAR_LEN (str[0]);
                    if (str[clen] != '\0') {
                        reportError (node, "minus-sign has to be a single"
                                     " char", errMsg);
                        if (newdf) FREE((char*)df);
                        return -1;
                    }
                    Tcl_UtfToUniChar (str, &df->minusSign);
                }
                str = getAttr(node, "NaN",                a_nan);
                if (str) df->NaN = str;
                str = getAttr(node, "percent",            a_percent);
                if (str) {
                    if (str[1] != '\0') {
                        reportError (node, "percent has to be a single"
                                     " char", errMsg);
                        return -1;
                    }
                    df->percent = str[0];
                    clen = UTF8_CHAR_LEN (str[0]);
                    if (str[clen] != '\0') {
                        reportError (node, "percent has to be a single"
                                     " char", errMsg);
                        if (newdf) FREE((char*)df);
                        return -1;
                    }
                    Tcl_UtfToUniChar (str, &df->percent);                    
                }
                str = getAttr(node, "per-mille",          a_perMille);
                if (str) {
                    clen = UTF8_CHAR_LEN (str[0]);
                    if (str[clen] != '\0') {
                        reportError (node, "per-mille has to be a single"
                                     " char", errMsg);
                        if (newdf) FREE((char*)df);
                        return -1;
                    }
                    Tcl_UtfToUniChar (str, &df->perMille);                    
                }
                str = getAttr(node, "zero-digit",         a_zeroDigit);
                if (str) {
                    clen = UTF8_CHAR_LEN (str[0]);
                    if (str[clen] != '\0') {
                        reportError (node, "zero-digit has to be a single"
                                     " char", errMsg);
                        if (newdf) FREE((char*)df);
                        return -1;
                    }
                    Tcl_UtfToUniChar (str, &df->zeroDigit);                    
                }
                str = getAttr(node, "digit",              a_digit);
                if (str) {
                    clen = UTF8_CHAR_LEN (str[0]);
                    if (str[clen] != '\0') {
                        reportError (node, "digit has to be a single char",
                                     errMsg);
                        if (newdf) FREE((char*)df);
                        return -1;
                    }
                    Tcl_UtfToUniChar (str, &df->digit);
                }
                str = getAttr(node, "pattern-separator",  a_patternSeparator);
                if (str) {
                    clen = UTF8_CHAR_LEN (str[0]);
                    if (str[clen] != '\0') {
                        reportError (node, "pattern-separator has to be a"
                                     " single char", errMsg);
                        return -1;
                    }
                    Tcl_UtfToUniChar (str, &df->patternSeparator);
                }
                break;







|










|












|


















|










|










|










|










|







6156
6157
6158
6159
6160
6161
6162
6163
6164
6165
6166
6167
6168
6169
6170
6171
6172
6173
6174
6175
6176
6177
6178
6179
6180
6181
6182
6183
6184
6185
6186
6187
6188
6189
6190
6191
6192
6193
6194
6195
6196
6197
6198
6199
6200
6201
6202
6203
6204
6205
6206
6207
6208
6209
6210
6211
6212
6213
6214
6215
6216
6217
6218
6219
6220
6221
6222
6223
6224
6225
6226
6227
6228
6229
6230
6231
6232
6233
6234
6235
6236
6237
6238
6239
6240
6241
6242
6243
6244
6245
6246
6247
6248
6249
6250
6251
6252
6253
6254
6255
6256
6257
                } else {
                    /* definitions for default decimal format */
                    df = xs->decimalFormats;
                }
                str = getAttr(node, "decimal-separator",  a_decimalSeparator);
                if (str) {
                    clen = UTF8_CHAR_LEN (str[0]);
                    if (!clen || str[clen] != '\0') {
                        reportError (node, "decimal-separator has to be a"
                                     " single char", errMsg);
                        if (newdf) FREE((char*)df);
                        return -1;
                    }
                    Tcl_UtfToUniChar (str, &df->decimalSeparator);
                }
                str = getAttr(node, "grouping-separator", a_groupingSeparator);
                if (str) {
                    clen = UTF8_CHAR_LEN (str[0]);
                    if (!clen || str[clen] != '\0') {
                        reportError (node, "groupingSeparator has to be a"
                                     " single char", errMsg);
                        if (newdf) FREE((char*)df);
                        return -1;
                    }
                    Tcl_UtfToUniChar (str, &df->groupingSeparator);
                }
                str = getAttr(node, "infinity",           a_infinity);
                if (str) df->infinity = str;
                str = getAttr(node, "minus-sign",         a_minusSign);
                if (str) {
                    clen = UTF8_CHAR_LEN (str[0]);
                    if (!clen || str[clen] != '\0') {
                        reportError (node, "minus-sign has to be a single"
                                     " char", errMsg);
                        if (newdf) FREE((char*)df);
                        return -1;
                    }
                    Tcl_UtfToUniChar (str, &df->minusSign);
                }
                str = getAttr(node, "NaN",                a_nan);
                if (str) df->NaN = str;
                str = getAttr(node, "percent",            a_percent);
                if (str) {
                    if (str[1] != '\0') {
                        reportError (node, "percent has to be a single"
                                     " char", errMsg);
                        return -1;
                    }
                    df->percent = str[0];
                    clen = UTF8_CHAR_LEN (str[0]);
                    if (!clen || str[clen] != '\0') {
                        reportError (node, "percent has to be a single"
                                     " char", errMsg);
                        if (newdf) FREE((char*)df);
                        return -1;
                    }
                    Tcl_UtfToUniChar (str, &df->percent);                    
                }
                str = getAttr(node, "per-mille",          a_perMille);
                if (str) {
                    clen = UTF8_CHAR_LEN (str[0]);
                    if (!clen || str[clen] != '\0') {
                        reportError (node, "per-mille has to be a single"
                                     " char", errMsg);
                        if (newdf) FREE((char*)df);
                        return -1;
                    }
                    Tcl_UtfToUniChar (str, &df->perMille);                    
                }
                str = getAttr(node, "zero-digit",         a_zeroDigit);
                if (str) {
                    clen = UTF8_CHAR_LEN (str[0]);
                    if (!clen || str[clen] != '\0') {
                        reportError (node, "zero-digit has to be a single"
                                     " char", errMsg);
                        if (newdf) FREE((char*)df);
                        return -1;
                    }
                    Tcl_UtfToUniChar (str, &df->zeroDigit);                    
                }
                str = getAttr(node, "digit",              a_digit);
                if (str) {
                    clen = UTF8_CHAR_LEN (str[0]);
                    if (!clen || str[clen] != '\0') {
                        reportError (node, "digit has to be a single char",
                                     errMsg);
                        if (newdf) FREE((char*)df);
                        return -1;
                    }
                    Tcl_UtfToUniChar (str, &df->digit);
                }
                str = getAttr(node, "pattern-separator",  a_patternSeparator);
                if (str) {
                    clen = UTF8_CHAR_LEN (str[0]);
                    if (!clen || str[clen] != '\0') {
                        reportError (node, "pattern-separator has to be a"
                                     " single char", errMsg);
                        return -1;
                    }
                    Tcl_UtfToUniChar (str, &df->patternSeparator);
                }
                break;

Changes to generic/tcldom.c.

215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
    )
    "    createNodeCmd ?-returnNodeCmd? ?-tagName name? ?-jsonType jsonType? ?-namespace URI? (element|comment|text|cdata|pi)Node cmdName \n"
    "    setStoreLineColumn ?boolean?                     \n"
    "    setNameCheck ?boolean?                           \n"
    "    setTextCheck ?boolean?                           \n"
    "    setObjectCommands ?(automatic|token|command)?    \n"
    "    isCharData string                                \n"
    "    clearString string                               \n"
    "    isBMPCharData string                             \n"
    "    isComment string                                 \n"
    "    isCDATA string                                   \n"
    "    isPIValue string                                 \n"
    "    isName string                                    \n"
    "    isQName string                                   \n"
    "    isNCName string                                  \n"







|







215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
    )
    "    createNodeCmd ?-returnNodeCmd? ?-tagName name? ?-jsonType jsonType? ?-namespace URI? (element|comment|text|cdata|pi)Node cmdName \n"
    "    setStoreLineColumn ?boolean?                     \n"
    "    setNameCheck ?boolean?                           \n"
    "    setTextCheck ?boolean?                           \n"
    "    setObjectCommands ?(automatic|token|command)?    \n"
    "    isCharData string                                \n"
    "    clearString ?-replace ?replacement?? string      \n"
    "    isBMPCharData string                             \n"
    "    isComment string                                 \n"
    "    isCDATA string                                   \n"
    "    isPIValue string                                 \n"
    "    isName string                                    \n"
    "    isQName string                                   \n"
    "    isNCName string                                  \n"
7826
7827
7828
7829
7830
7831
7832
7833

7834
7835
7836
7837

7838
7839
7840
7841
7842
7843
7844
    Tcl_Interp * interp,
    int          objc,
    Tcl_Obj    * const objv[]
)
{
    GetTcldomDATA;

    char        * method, tmp[300], *clearedStr, *string, *option;

    int           methodIndex, result, i, bool, replace = 0;
    domLength     len;
    Tcl_CmdInfo   cmdInfo;
    Tcl_Obj     * mobjv[MAX_REWRITE_ARGS], *newObj;


    static const char *domMethods[] = {
        "createDocument",  "createDocumentNS",   "createNodeCmd",
        "parse",                                 "setStoreLineColumn",
        "isCharData",      "isName",             "isPIName",
        "isQName",         "isComment",          "isCDATA",
        "isPIValue",       "isNCName",           "createDocumentNode",







|
>
|
|


>







7826
7827
7828
7829
7830
7831
7832
7833
7834
7835
7836
7837
7838
7839
7840
7841
7842
7843
7844
7845
7846
    Tcl_Interp * interp,
    int          objc,
    Tcl_Obj    * const objv[]
)
{
    GetTcldomDATA;

    char        * method, tmp[300], *string, *option,
                 *replacement;
    int           methodIndex, result, i, bool, changed;
    domLength     repllen;
    Tcl_CmdInfo   cmdInfo;
    Tcl_Obj     * mobjv[MAX_REWRITE_ARGS], *newObj;
    Tcl_DString   cleardString;

    static const char *domMethods[] = {
        "createDocument",  "createDocumentNS",   "createNodeCmd",
        "parse",                                 "setStoreLineColumn",
        "isCharData",      "isName",             "isPIName",
        "isQName",         "isComment",          "isCDATA",
        "isPIValue",       "isNCName",           "createDocumentNode",
8091
8092
8093
8094
8095
8096
8097
8098
8099
8100
8101
8102
8103
8104
8105
8106
8107
8108

8109
8110



8111
8112
8113



8114




8115

8116

8117
8118
8119


8120
8121
8122
8123
8124
8125
8126
8127
8128
8129
8130
8131
8132

        case m_isBMPCharData:
            CheckArgs(3,3,2,"string");
            SetBooleanResult(domIsBMPChar(Tcl_GetString(objv[2])));
            return TCL_OK;

        case m_clearString:
            CheckArgs(3,4,2,"?-replace? string");
            if (objc == 4) {
                option = Tcl_GetString (objv[2]);
                if (option[0] == '-' && option[1] == 'r') {
                    if (Tcl_GetIndexFromObj (interp, objv[2],
                                             clearStringOptions, "option",
                                             0, &i) != TCL_OK) {
                        return TCL_ERROR;
                    }
                } else {
                    SetResult("expected: clearString ?-replace? string");

                    return TCL_ERROR;
                }



                replace = 1;
                objc--;
                objv++;



            }




            string = Tcl_GetStringFromObj (objv[2], &len);

            clearedStr = domClearString (string, &bool, replace, len);

            if (bool) {
                newObj = Tcl_NewStringObj (clearedStr, -1);
                FREE (clearedStr);


                Tcl_SetObjResult (interp, newObj);
            } else {
                Tcl_SetObjResult (interp, objv[2]);
            }
            return TCL_OK;
                
    }
    SetResult( dom_usage);
    return TCL_ERROR;
}

#ifdef TCL_THREADS








|
|








|
>


>
>
>
|
|
|
>
>
>
|
>
>
>
>
|
>
|
>
|
|
|
>
>





<







8093
8094
8095
8096
8097
8098
8099
8100
8101
8102
8103
8104
8105
8106
8107
8108
8109
8110
8111
8112
8113
8114
8115
8116
8117
8118
8119
8120
8121
8122
8123
8124
8125
8126
8127
8128
8129
8130
8131
8132
8133
8134
8135
8136
8137
8138
8139
8140
8141

8142
8143
8144
8145
8146
8147
8148

        case m_isBMPCharData:
            CheckArgs(3,3,2,"string");
            SetBooleanResult(domIsBMPChar(Tcl_GetString(objv[2])));
            return TCL_OK;

        case m_clearString:
            CheckArgs(3,5,2,"?-replace ?replacement?? string");
            if (objc >= 4) {
                option = Tcl_GetString (objv[2]);
                if (option[0] == '-' && option[1] == 'r') {
                    if (Tcl_GetIndexFromObj (interp, objv[2],
                                             clearStringOptions, "option",
                                             0, &i) != TCL_OK) {
                        return TCL_ERROR;
                    }
                } else {
                    SetResult("expected: clearString ?-replace ?replacement?"
                              " string");
                    return TCL_ERROR;
                }
                objc--;
                objv++;
                if (objc == 4) {
                    replacement = Tcl_GetStringFromObj (objv[2], &repllen);
                    objc--;
                    objv++;
                } else {
                    replacement = "\xEF\xBF\xBD\0";
                    repllen = 3;
                }
            } else {
                replacement = NULL;
                repllen = 0;
            }
            string = Tcl_GetString (objv[2]);
            changed = 0;
            domClearString (string, replacement, repllen, &cleardString,
                            &changed);
            if (changed) {
                newObj = Tcl_NewStringObj (
                    Tcl_DStringValue (&cleardString),
                    Tcl_DStringLength (&cleardString));
                Tcl_DStringFree (&cleardString);
                Tcl_SetObjResult (interp, newObj);
            } else {
                Tcl_SetObjResult (interp, objv[2]);
            }
            return TCL_OK;

    }
    SetResult( dom_usage);
    return TCL_ERROR;
}

#ifdef TCL_THREADS