r3304 jmb - /trunk/netsurf/render/html.c
by netsurf@semichrome.net
Author: jmb
Date: Tue May 29 19:03:07 2007
New Revision: 3304
URL: http://svn.semichrome.net?rev=3304&view=rev
Log:
Fix bugs in charset detection.
Strip BOM from parser input, as it confuses libxml.
Ignore non-ASCII-compatible charsets declared in meta tag (the parser
defaults to 8 bit, so if it's managed to extract a meta charset, then it
must be ASCII-compatible, so a non-ASCII-compatible meta charset is a lie).
Fixes WightLink timetable and 1726341.
Modified:
trunk/netsurf/render/html.c
Modified: trunk/netsurf/render/html.c
URL: http://svn.semichrome.net/trunk/netsurf/render/html.c?rev=3304&r1=3303&r2=3304&view=diff
==============================================================================
--- trunk/netsurf/render/html.c (original)
+++ trunk/netsurf/render/html.c Tue May 29 19:03:07 2007
@@ -38,7 +38,7 @@
=
=
static bool html_set_parser_encoding(struct content *c, const char *encodi=
ng);
-static const char *html_detect_encoding(const char *data, unsigned int siz=
e);
+static const char *html_detect_encoding(const char **data, unsigned int *s=
ize);
static void html_convert_css_callback(content_msg msg, struct content *css,
intptr_t p1, intptr_t p2, union content_msg_data data);
static bool html_meta_refresh(struct content *c, xmlNode *head);
@@ -157,7 +157,7 @@
* searches for a <meta http-equiv=3D"content-type"
* content=3D"text/html; charset=3D...">. */
const char *encoding;
- encoding =3D html_detect_encoding(data, size);
+ encoding =3D html_detect_encoding((const char **) &data, &size);
if (encoding) {
if (!html_set_parser_encoding(c, encoding))
return false;
@@ -168,6 +168,12 @@
ENCODING_SOURCE_DETECTED;
}
c->data.html.getenc =3D false;
+
+ /* The data we received may have solely consisted of a BOM.
+ * If so, it will have been stripped by html_detect_encoding.
+ * Therefore, we'll have nothing to do in that case. */
+ if (size =3D=3D 0)
+ return true;
}
=
for (x =3D 0; x + CHUNK <=3D size; x +=3D CHUNK) {
@@ -180,8 +186,22 @@
/* The encoding was not in headers or detected,
* and the parser found a <meta http-equiv=3D"content-type"
* content=3D"text/html; charset=3D...">. */
- c->data.html.encoding =3D talloc_strdup(c,
+
+ /* However, if that encoding is non-ASCII-compatible,
+ * ignore it, as it can't possibly be correct */
+ if (strncasecmp(c->data.html.parser->input->encoding,
+ "UTF-16", 6) =3D=3D 0 || /* UTF-16(LE|BE)? */
+ strncasecmp(c->data.html.parser->input->encoding,
+ "UTF-32", 6) =3D=3D 0) { /* UTF-32(LE|BE)? */
+ c->data.html.encoding =3D talloc_strdup(c, "ISO-8859-1");
+ c->data.html.encoding_source =3D
+ ENCODING_SOURCE_DETECTED;
+ } else {
+ c->data.html.encoding =3D talloc_strdup(c,
c->data.html.parser->input->encoding);
+ c->data.html.encoding_source =3D ENCODING_SOURCE_META;
+ }
+
if (!c->data.html.encoding) {
union content_msg_data msg_data;
=
@@ -189,7 +209,6 @@
content_broadcast(c, CONTENT_MSG_ERROR, msg_data);
return false;
}
- c->data.html.encoding_source =3D ENCODING_SOURCE_META;
=
/* have the encoding; don't attempt to detect it */
c->data.html.getenc =3D false;
@@ -293,33 +312,60 @@
/**
* Attempt to detect the encoding of some HTML data.
*
- * \param data HTML source data
- * \param size length of data
+ * \param data Pointer to HTML source data
+ * \param size Pointer to length of data
* \return a constant string giving the encoding, or 0 if the encoding
* appears to be some 8-bit encoding
- */
-
-const char *html_detect_encoding(const char *data, unsigned int size)
-{
+ *
+ * If a BOM is encountered, *data and *size will be modified to skip over =
it
+ */
+
+const char *html_detect_encoding(const char **data, unsigned int *size)
+{
+ const unsigned char *d =3D (const unsigned char *) *data;
+
/* this detection assumes that the first two characters are <=3D 0xff */
- if (size < 4)
+ if (*size < 4)
return 0;
- if (data[0] =3D=3D 0xfe && data[1] =3D=3D 0xff) /* BOM fe ff=
*/
+
+ if (d[0] =3D=3D 0x00 && d[1] =3D=3D 0x00 &&
+ d[2] =3D=3D 0xfe && d[3] =3D=3D 0xff) { /* BOM 00 00 fe ff */
+ *data +=3D 4;
+ *size -=3D 4;
+ return "UTF-32BE";
+ } else if (d[0] =3D=3D 0xff && d[1] =3D=3D 0xfe &&
+ d[2] =3D=3D 0x00 && d[3] =3D=3D 0x00) { /* BOM ff fe 00 00 */
+ *data +=3D 4;
+ *size -=3D 4;
+ return "UTF-32LE";
+ }
+ else if (d[0] =3D=3D 0x00 && d[1] !=3D 0x00 &&
+ d[2] =3D=3D 0x00 && d[3] !=3D 0x00) /* 00 xx 00 xx */
return "UTF-16BE";
- else if (data[0] =3D=3D 0xfe && data[1] =3D=3D 0xff) /* BOM ff fe=
*/
+ else if (d[0] !=3D 0x00 && d[1] =3D=3D 0x00 &&
+ d[2] !=3D 0x00 && d[3] =3D=3D 0x00) /* xx 00 xx 00 */
return "UTF-16LE";
- else if (data[0] =3D=3D 0x00 && data[1] !=3D 0x00 &&
- data[2] =3D=3D 0x00 && data[3] !=3D 0x00) /* 00 xx 00 xx */
+ else if (d[0] =3D=3D 0x00 && d[1] =3D=3D 0x00 &&
+ d[2] =3D=3D 0x00 && d[3] !=3D 0x00) /* 00 00 00 xx */
+ return "ISO-10646-UCS-4";
+ else if (d[0] !=3D 0x00 && d[1] =3D=3D 0x00 &&
+ d[2] =3D=3D 0x00 && d[3] =3D=3D 0x00) /* xx 00 00 00 */
+ return "ISO-10646-UCS-4";
+ else if (d[0] =3D=3D 0xfe && d[1] =3D=3D 0xff) { /* BOM fe ff */
+ *data +=3D 2;
+ *size -=3D 2;
return "UTF-16BE";
- else if (data[0] !=3D 0x00 && data[1] =3D=3D 0x00 &&
- data[2] !=3D 0x00 && data[3] =3D=3D 0x00) /* xx 00 xx 00 */
- return "UTF-16BE";
- else if (data[0] =3D=3D 0x00 && data[1] =3D=3D 0x00 &&
- data[2] =3D=3D 0x00 && data[3] !=3D 0x00) /* 00 00 00 xx */
- return "ISO-10646-UCS-4";
- else if (data[0] !=3D 0x00 && data[1] =3D=3D 0x00 &&
- data[2] =3D=3D 0x00 && data[3] =3D=3D 0x00) /* xx 00 00 00 */
- return "ISO-10646-UCS-4";
+ } else if (d[0] =3D=3D 0xfe && d[1] =3D=3D 0xff) { /* BOM ff fe */
+ *data +=3D 2;
+ *size -=3D 2;
+ return "UTF-16LE";
+ } else if (d[0] =3D=3D 0xef && d[1] =3D=3D 0xbb &&
+ d[2] =3D=3D 0xbf) { /* BOM ef bb bf */
+ *data +=3D 3;
+ *size -=3D 3;
+ return "UTF-8";
+ }
+
return 0;
}
=