Index: generic/domhtml.c ================================================================== --- generic/domhtml.c +++ generic/domhtml.c @@ -2303,12 +2303,17 @@ /*---------------------------------------------------------------------------- | TranslateEntityRefs -- | -| Translate entity references and character references in the string -| "z". "z" is overwritten with the translated sequence. +| Translate entity references and character references in the +| nodeValue of the domTextNode or domAttrNode given as +| argument. Since all character references and almost all +| entity references are at least as long as the +| referenced UTF-8 byte sequence the translation is basically +| done by rewriting the nodeValue in place and with special +| handling of the few counterexamples. | | Unrecognized entity references are unaltered. | | Example: | @@ -2315,21 +2320,29 @@ | input = "AT&amp;T &gt; MCI" | output = "AT&T > MCI" | \---------------------------------------------------------------------------*/ static void TranslateEntityRefs ( - char *z, - domLength *newLen + domTextNode *textOrAtt ) { + char *z; /* Pointer to nodeValue to rewrite */ int from; /* Read characters from this position in z[] */ int to; /* Write characters into this position in z[] */ int h; /* A hash on the entity reference */ char *zVal; /* The substituted value */ Er *p; /* For looping down the entity reference collision chain */ - int value; - + int value, zlen, overlen; + char *ole, *newNodeValue; + + if (textOrAtt->nodeType == ATTRIBUTE_NODE) { + z = ((domAttrNode*)textOrAtt)->nodeValue; + zlen = ((domAttrNode*)textOrAtt)->valueLength; + } else { + z = textOrAtt->nodeValue; + zlen = textOrAtt->valueLength; + } from = to = 0; if (bErNeedsInit) { TDomThreaded(Tcl_MutexLock(&initMutex);) if (bErNeedsInit) { @@ -2416,11 +2429,14 @@ } else if (value <= 0xFFFF) { z[to++] = (char) ((value >> 12) | 0xE0); z[to++] = (char) (((value >> 6) | 0x80) & 0xBF); z[to++] = (char) ((value | 0x80) & 0xBF); } else { - 
/* error */ + z[to++] = (char) ((value >> 18) | 0xf0); + z[to++] = (char) (((value >> 12) & 0x3f) | 0x80); + z[to++] = (char) (((value >> 6) & 0x3f) | 0x80); + z[to++] = (char) ((value & 0x3f) | 0x80); } from = i+1; } } else { while (z[i] && isalnum((unsigned char)z[i])) { @@ -2430,10 +2446,23 @@ z[i] = 0; h = ErHash(&z[from+1]); p = apErHash[h]; while (p && strcmp(p->zName,&z[from+1])!=0 ) { p = p->pNext; + } + ole = NULL; + if (!p && c == ';') { + /* Entity name not found. It may be one of the few + * entities with a referenced UTF-8 byte sequence + * which is longer than the reference. */ + if (strcmp("nGt",&z[from+1]) == 0) { + ole = "\xE2\x89\xAB\xE2\x83\x92\x00"; + overlen = 1; + } else if (strcmp("nLt",&z[from+1]) == 0) { + ole = "\xE2\x89\xAA\xE2\x83\x92\x00"; + overlen = 1; + } } z[i] = c; if (p) { zVal = p->zValue; while (*zVal) { @@ -2440,19 +2469,47 @@ z[to++] = *(zVal++); } from = i; if (c==';') from++; } else { - z[to++] = z[from++]; + if (ole) { + /* Over-long entity reference */ + from = i; + newNodeValue = MALLOC(zlen + 1 + overlen); + memmove(newNodeValue,z,to); + while (*ole) { + newNodeValue[to++] = *(ole++); + } + memmove(newNodeValue + to, z + from + 1 , + zlen - from); + z = newNodeValue; + zlen = zlen + overlen; + if (textOrAtt->nodeType == ATTRIBUTE_NODE) { + FREE (((domAttrNode*)textOrAtt)->nodeValue); + ((domAttrNode*)textOrAtt)->nodeValue = z; + ((domAttrNode*)textOrAtt)->valueLength = zlen; + } else { + FREE (textOrAtt->nodeValue); + textOrAtt->nodeValue = z; + textOrAtt->valueLength = zlen; + } + from = to; + } else { + z[to++] = z[from++]; + } } } } else { z[to++] = z[from++]; } } z[to] = 0; - *newLen = to; + if (textOrAtt->nodeType == ATTRIBUTE_NODE) { + ((domAttrNode*)textOrAtt)->valueLength = to; + } else { + textOrAtt->valueLength = to; + } } /*---------------------------------------------------------------------------- | End Of Character Entity Translator 
\---------------------------------------------------------------------------*/ @@ -2541,11 +2598,11 @@ tnode->nodeValue = (char*)MALLOC((x - start)+1); memmove(tnode->nodeValue, start, (x - start)); *(tnode->nodeValue + (x - start)) = 0; DBG(fprintf(stderr, "New text node: '%s'\n", tnode->nodeValue);) if (ampersandSeen) { - TranslateEntityRefs(tnode->nodeValue, &(tnode->valueLength) ); + TranslateEntityRefs (tnode); } tnode->parentNode = parent_node; if (parent_node->firstChild) { parent_node->lastChild->nextSibling = (domNode*)tnode; tnode->previousSibling = parent_node->lastChild; @@ -3093,12 +3150,11 @@ attrnode->nodeValue = (char*)MALLOC(nArgVal+1); attrnode->valueLength = nArgVal; memmove(attrnode->nodeValue, ArgVal, nArgVal); *(attrnode->nodeValue + nArgVal) = 0; if (ampersandSeen) { - TranslateEntityRefs(attrnode->nodeValue, - &(attrnode->valueLength) ); + TranslateEntityRefs ((domTextNode*)attrnode); } if (!strcmp(ArgName, "id")) { if (!doc->ids) { doc->ids = (Tcl_HashTable *) MALLOC (sizeof (Tcl_HashTable)); Index: tests/htmlreader.test ================================================================== --- tests/htmlreader.test +++ tests/htmlreader.test @@ -75,11 +75,11 @@ set s [encoding convertto utf-8 $c] binary scan $s H* x regsub -all -expanded {..} $x {\x&} } -test html-1.8 {character entities} { +test html-1.8 {character entities} {Tcl9} { set result "" foreach {entity byteseq} { "AElig" "\\xC3\\x86" "AMP" "\\x26" "Aacute" "\\xC3\\x81" @@ -2214,10 +2214,57 @@ $doc delete } set result } "" +test html-1.9 {non-existing character entities} { + set doc [dom parse -html {&abcdef;}] + set root [$doc documentElement] + set result [$root text] + $doc delete + set result +} "&abcdef;" + +test html-1.10 {non-BMP character reference} {Tcl9} { + set doc [dom parse -html {𝒞𝒞}] + set root [$doc documentElement] + set result [$root text] + $doc delete + set result +} "\U1d49e\U1d49e" + +test html-1.11 {Prematur end of entity reference with overlong repacement} 
{Tcl9} { + set doc [dom parse -html {&nGt}] + set root [$doc documentElement] + set result [$root text] + $doc delete + set result +} "&nGt" + +test html-1.12 {Entity reference with overlong repacement} {Tcl9} { + set doc [dom parse -html {≫⃒}] + set root [$doc documentElement] + set result [$root text] + $doc delete + set result +} "\u226B\u20D2" + +test html-1.13 {Entity reference with overlong repacement} {Tcl9} { + set doc [dom parse -html {≫⃒abc≫⃒∡≪⃒≫⃒foobar}] + set root [$doc documentElement] + set result [$root text] + $doc delete + set result +} "\u226B\u20D2abc\u226B\u20D2\u2221\u226A\u20D2\u226B\u20D2foobar" + +test html-1.14 {Entity reference with overlong repacement} {Tcl9} { + set doc [dom parse -html {≫⃒abc}] + set root [$doc documentElement] + set result [$root text] + $doc delete + set result +} "\u226B\u20D2abc" test html-2.1 {not closed p tags} { set doc [dom parse -html {

Para 1

Para 2

Para 3 }]