khanat-code-old/code/nel/src/gui/group_html_parser.cpp

// Ryzom - MMORPG Framework <http://dev.ryzom.com/projects/ryzom/>
// Copyright (C) 2010  Winch Gate Property Limited
//
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU Affero General Public License as
// published by the Free Software Foundation, either version 3 of the
// License, or (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
// GNU Affero General Public License for more details.
//
// You should have received a copy of the GNU Affero General Public License
// along with this program.  If not, see <http://www.gnu.org/licenses/>.


#include "stdpch.h"

#include <string>
#include <libxml/HTMLparser.h>

#include "nel/misc/types_nl.h"
#include "nel/gui/libwww.h"
#include "nel/gui/group_html.h"
#include "nel/gui/lua_ihm.h"

using namespace std;
using namespace NLMISC;

namespace NLGUI
{
	// ***************************************************************************
	// Forwards one DOM element, together with its subtree, to the element
	// handlers of this class.
	//
	// node            element node being visited
	// element_number  index of the tag inside the HTML_dtd() tag table; a value
	//                 that is not below HTML_ELEMENTS marks an unrecognised tag
	//
	// Recognised tags: every attribute the tag table lists for the tag is looked
	// up on the node, then addLink() (HTML_A only) and beginElement() are called,
	// the children are walked and endElement() closes the tag.
	// Unrecognised tags: beginUnparsedElement() / endUnparsedElement() receive
	// the raw tag name instead.
	void CGroupHTML::htmlElement(xmlNode *node, int element_number)
	{
		SGML_dtd *HTML_DTD = HTML_dtd ();
		const bool knownTag = element_number < HTML_ELEMENTS;

		if (knownTag)
		{
			CXMLAutoPtr ptr;

			// number of attribute slots that may be written below
			const uint nbAttributes = std::min(MAX_ATTRIBUTES, HTML_DTD->tags[element_number].number_of_attributes);

			// Keep the historical minimum of 30 slots, but never fewer than the
			// loop below can index: the previous fixed size of 30 was independent
			// of MAX_ATTRIBUTES and could be overrun.
			size_t nbSlots = 30;
			if (nbSlots < nbAttributes)
				nbSlots = nbAttributes;

			// load attributes into libwww structs
			std::vector<bool> present(nbSlots, false);
			std::vector<const char *> value(nbSlots);
			// owns the attribute text that value[] points into
			std::string strvalues[MAX_ATTRIBUTES];

			for (uint i = 0; i < nbAttributes; ++i)
			{
				// the table name is lower-cased before the lookup on the node
				const std::string name = toLower(std::string(HTML_DTD->tags[element_number].attributes[i].name));
				ptr = xmlGetProp(node, (const xmlChar *)name.c_str());
				if (ptr)
				{
					// copy xmlChar to string (xmlChar will be released)
					strvalues[i] = (const char *)(ptr);
					// now use string pointer in value[] array
					value[i] = strvalues[i].c_str();
					present[i] = true;
				}
			}

			if (element_number == HTML_A)
			{
				addLink(element_number, present, value);
			}

			beginElement(element_number, present, value);
		}
		else
		{
			beginUnparsedElement((const char *)(node->name), xmlStrlen(node->name));
		}

		// recursive - text content / child nodes
		htmlWalkDOM(node->children);

		// closing tag
		if (knownTag)
		{
			endElement(element_number);
		}
		else
		{
			endUnparsedElement((const char *)(node->name), xmlStrlen(node->name));
		}
	}

	// ***************************************************************************
	// Recursive function to walk the html document.
	//
	// Visits a_node and each of its following siblings in order. Text nodes are
	// passed to addText(); element nodes are matched against the HTML_dtd() tag
	// table and handed to htmlElement(), which walks their children in turn.
	// Other node types are skipped. A NULL a_node is a no-op.
	void CGroupHTML::htmlWalkDOM(xmlNode *a_node)
	{
		SGML_dtd *HTML_DTD = HTML_dtd ();

		for (xmlNode *node = a_node; node; node = node->next)
		{
			if (node->type == XML_TEXT_NODE)
			{
				addText((const char *)(node->content), xmlStrlen(node->content));
			}
			else if (node->type == XML_ELEMENT_NODE)
			{
				// find libwww tag; element_number ends up equal to HTML_ELEMENTS
				// when the name is not in the table
				uint element_number;
				for (element_number = 0; element_number < HTML_ELEMENTS; ++element_number)
				{
					// Compare the complete names: a comparison limited to the
					// length of node->name would also accept any table entry
					// that merely starts with the node name.
					if (xmlStrcasecmp(node->name, (const xmlChar *)HTML_DTD->tags[element_number].name.c_str()) == 0)
						break;
				}

				htmlElement(node, element_number);
			}
		}
	}

	// ***************************************************************************
	// Based on http://stackoverflow.com/a/18335183
	//
	// True when b has the form of a UTF-8 continuation byte (0x80..0xBF).
	static bool isUtf8Trail(unsigned char b)
	{
		return b > 127 && b < 192;
	}

	// Returns a copy of str in which every byte sequence that is not
	// well-formed UTF-8 has been dropped or re-encoded:
	//  - bytes below 32 are dropped, except \t, \n and \r
	//  - bytes 32..126 are copied
	//  - bytes 127..159 are dropped, except 128 (written as the euro sign)
	//    and 133 (written as "\n\r")
	//  - bytes 160..193 are treated as ISO-8859-1 and written as 2 bytes
	//  - well-formed 2, 3 and 4 byte sequences are copied, except the 2 byte
	//    encodings of U+0080..U+009F which are dropped
	//  - any other lead byte (truncated sequence, bad continuation byte,
	//    overlong form, UTF-16 surrogate, value above U+10FFFF, byte >= 245)
	//    is treated as ISO-8859-1 and written as 2 bytes; scanning then resumes
	//    at the following byte
	static std::string correct_non_utf_8(const std::string &str)
	{
		const size_t size = str.size();
		std::string to;
		to.reserve(size);

		for (size_t i = 0; i < size; ++i)
		{
			const unsigned char c = (unsigned char)(str[i]);

			if (c < 32)
			{
				// control char: allow only \t \n \r
				if (c == 9 || c == 10 || c == 13)
					to.append(1, (char)c);
				continue;
			}
			if (c < 127)
			{
				// normal ASCII
				to.append(1, (char)c);
				continue;
			}
			if (c < 160)
			{
				// control char (nothing should be defined here in either ASCII,
				// ISO-8859-1 or UTF-8, so skipping)
				if (c == 128)
					to.append("\xE2\x82\xAC"); // fix microsoft mess, add euro
				else if (c == 133)
					to.append("\n\r"); // fix IBM mess, add NEL = \n\r
				continue;
			}
			if (c < 192)
			{
				// invalid as a UTF-8 lead byte, convert from ISO-8859-1
				to.append(1, (char)194);
				to.append(1, (char)c);
				continue;
			}
			if (c < 194)
			{
				// invalid as a UTF-8 lead byte, convert from ISO-8859-1
				to.append(1, (char)195);
				to.append(1, (char)(c - 64));
				continue;
			}

			// number of bytes available after the lead byte
			const size_t left = size - i - 1;

			if (c < 224)
			{
				// possibly 2 byte UTF-8
				if (left >= 1)
				{
					const unsigned char c2 = (unsigned char)(str[i + 1]);
					if (isUtf8Trail(c2))
					{
						// U+0080..U+009F are control chars, skipping
						if (!(c == 194 && c2 < 160))
						{
							to.append(1, (char)c);
							to.append(1, (char)c2);
						}
						i += 1;
						continue;
					}
				}
			}
			else if (c < 240)
			{
				// possibly 3 byte UTF-8
				if (left >= 2)
				{
					const unsigned char c2 = (unsigned char)(str[i + 1]);
					const unsigned char c3 = (unsigned char)(str[i + 2]);
					// E0 80..9F is an overlong form, ED A0..BF is a surrogate
					const bool secondOk = isUtf8Trail(c2)
						&& !(c == 224 && c2 < 160)
						&& !(c == 237 && c2 >= 160);
					if (secondOk && isUtf8Trail(c3))
					{
						to.append(1, (char)c);
						to.append(1, (char)c2);
						to.append(1, (char)c3);
						i += 2;
						continue;
					}
				}
			}
			else if (c < 245)
			{
				// possibly 4 byte UTF-8
				if (left >= 3)
				{
					const unsigned char c2 = (unsigned char)(str[i + 1]);
					const unsigned char c3 = (unsigned char)(str[i + 2]);
					const unsigned char c4 = (unsigned char)(str[i + 3]);
					// F0 80..8F is an overlong form, F4 90..BF is above U+10FFFF
					const bool secondOk = isUtf8Trail(c2)
						&& !(c == 240 && c2 < 144)
						&& !(c == 244 && c2 >= 144);
					if (secondOk && isUtf8Trail(c3) && isUtf8Trail(c4))
					{
						to.append(1, (char)c);
						to.append(1, (char)c2);
						to.append(1, (char)c3);
						to.append(1, (char)c4);
						i += 3;
						continue;
					}
				}
			}

			// not a usable multi-byte sequence: convert the lead byte alone
			// from ISO-8859-1
			to.append(1, (char)195);
			to.append(1, (char)(c - 64));
		}
		return to;
	}

	// ***************************************************************************
	// Rewrites htmlString in place so that the html parser keeps the tags of the
	// original document:
	//  1. a leading UTF-8 byte order mark is removed
	//  2. markup found before "<html>" is moved behind the opening body tag
	//  3. "</html>" is moved to the very end of the document
	//  4. the result is passed through correct_non_utf_8()
	// Tag names are matched in lower case only.
	static void patchHtmlQuirks(std::string &htmlString)
	{
		const std::string::size_type notFound = std::string::npos;

		// get rid of BOM (some ingame help files do not show up otherwise)
		if (htmlString.compare(0, 3, "\xEF\xBB\xBF") == 0)
			htmlString.erase(0, 3);

		// If any element comes before <html>, the parser adds <html><body>
		// itself and the original tags are ignored (their attributes are not
		// processed). Only the case of a <body> tag carrying attributes is
		// repaired here.
		const std::string::size_type bodyPos = htmlString.find("<body ");
		if (bodyPos != notFound)
		{
			// first tag of the document, stepping over a leading <!doctype html>
			std::string::size_type firstTag = htmlString.find("<");
			if (htmlString.compare(firstTag, 2, "<!") == 0)
				firstTag = htmlString.find("<", firstTag + 1);

			// without an html tag between the first tag and body nothing is done
			const std::string::size_type htmlPos = htmlString.find("<html>");
			const bool strayMarkup = htmlPos != notFound && firstTag < htmlPos && htmlPos < bodyPos;
			if (strayMarkup)
			{
				// end of the opening body tag
				const std::string::size_type bodyClose = htmlString.find(">", bodyPos);
				if (bodyClose != notFound)
				{
					// cut the stray markup out and paste it right behind the
					// body tag; the target shifts left by the removed length
					const std::string stray(htmlString, firstTag, htmlPos - firstTag);
					htmlString.erase(firstTag, stray.size());
					htmlString.insert(bodyClose + 1 - stray.size(), stray);
				}
			}
		}

		// make sure </html> (if present) is last in document or tags coming after it are ignored
		// NOTE(review): the second test holds for every possible result of the
		// search (a later position or npos), so the tag is moved whenever it is
		// present - confirm whether "!= npos" was intended.
		const std::string::size_type closePos = htmlString.find("</html>");
		if (closePos != notFound && htmlString.find("<", closePos + 1) > closePos)
		{
			htmlString.erase(closePos, 7);
			htmlString.append("</html>");
		}

		// libxml breaks everything after the first invalid utf-8 char it finds
		htmlString = correct_non_utf_8(htmlString);
	}

	// ***************************************************************************
	bool CGroupHTML::parseHtml(std::string htmlString)
	{
		htmlParserCtxtPtr parser = htmlCreatePushParserCtxt(NULL, NULL, NULL, 0, NULL, XML_CHAR_ENCODING_UTF8);
		if (!parser)
		{
			nlwarning("Creating html parser context failed");
			return false;
		}

		htmlCtxtUseOptions(parser, HTML_PARSE_NOBLANKS | HTML_PARSE_NOERROR | HTML_PARSE_NOWARNING | HTML_PARSE_NONET);

		// parser is little strict on tag order, so fix whats needed
		patchHtmlQuirks(htmlString);

		htmlParseChunk(parser, htmlString.c_str(), htmlString.size(), 0);
		htmlParseChunk(parser, "", 0, 1);

		bool success = true;
		if (parser->myDoc)
		{
			xmlNode *root = xmlDocGetRootElement(parser->myDoc);
			if (root)
			{
				htmlWalkDOM(root);
			}
			else
			{
				nlwarning("html root node failed");
				success = false;
			}
		}
		else
		{
			nlwarning("htmlstring parsing failed");
			success = false;
		}

		htmlFreeParserCtxt(parser);
		return success;
	}

	// ***************************************************************************
	// Lua entry point for parseHtml().
	//
	// Checks, through CLuaIHM, that the call carries one argument of type
	// LUA_TSTRING, then parses that string as html. The result of parseHtml()
	// is not reported back; the function always returns 0.
	int CGroupHTML::luaParseHtml(CLuaState &ls)
	{
		const char *funcName = "parseHtml";

		// argument validation
		CLuaIHM::checkArgCount(ls, funcName, 1);
		CLuaIHM::checkArgType(ls, funcName, 1, LUA_TSTRING);

		const std::string markup(ls.toString(1));
		parseHtml(markup);

		return 0;
	}

}
Implement html parser using libxml2 2015-04-13 17:52:34 +00:00			`// Ryzom - MMORPG Framework <http://dev.ryzom.com/projects/ryzom/>`
			`// Copyright (C) 2010 Winch Gate Property Limited`
			`//`
			`// This program is free software: you can redistribute it and/or modify`
			`// it under the terms of the GNU Affero General Public License as`
			`// published by the Free Software Foundation, either version 3 of the`
			`// License, or (at your option) any later version.`
			`//`
			`// This program is distributed in the hope that it will be useful,`
			`// but WITHOUT ANY WARRANTY; without even the implied warranty of`
			`// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the`
			`// GNU Affero General Public License for more details.`
			`//`
			`// You should have received a copy of the GNU Affero General Public License`
			`// along with this program. If not, see <http://www.gnu.org/licenses/>.`


			`#include "stdpch.h"`

			`#include <string>`
			`#include <libxml/HTMLparser.h>`

			`#include "nel/misc/types_nl.h"`
			`#include "nel/gui/libwww.h"`
			`#include "nel/gui/group_html.h"`
			`#include "nel/gui/lua_ihm.h"`

			`using namespace std;`
			`using namespace NLMISC;`

			`namespace NLGUI`
			`{`
			`// ***************************************************************************`
			`void CGroupHTML::htmlElement(xmlNode *node, int element_number)`
			`{`
			`SGML_dtd *HTML_DTD = HTML_dtd ();`

			`if (element_number < HTML_ELEMENTS)`
			`{`
			`CXMLAutoPtr ptr;`
			`// load attributes into libwww structs`
Fix compiling under windows 2015-04-19 22:04:52 +00:00			`std::vector<bool> present;`
			`std::vector<const char *>value;`
Implement html parser using libxml2 2015-04-13 17:52:34 +00:00			`std::string strvalues[MAX_ATTRIBUTES];`
Fix compiling under windows 2015-04-19 22:04:52 +00:00			`present.resize(30, false);`
			`value.resize(30);`
Implement html parser using libxml2 2015-04-13 17:52:34 +00:00
			`uint nbAttributes = std::min(MAX_ATTRIBUTES, HTML_DTD->tags[element_number].number_of_attributes);`
			`for(uint i=0; i<nbAttributes; i++)`
			`{`
			`std::string name;`
			`name = toLower(std::string(HTML_DTD->tags[element_number].attributes[i].name));`
			`ptr = xmlGetProp(node, (const xmlChar *)name.c_str());`
			`if (ptr)`
			`{`
			`// copy xmlChar to string (xmlChar will be released)`
			`strvalues[i] = (const char *)(ptr);`
			`// now use string pointer in value[] array`
			`value[i] = strvalues[i].c_str();`
			`present[i] = true;`
			`}`
			`}`

			`if (element_number == HTML_A)`
			`{`
			`addLink(element_number, present, value);`
			`}`

			`beginElement(element_number, present, value);`
			`}`
			`else`
			`{`
			`beginUnparsedElement((const char *)(node->name), xmlStrlen(node->name));`
			`}`

			`// recursive - text content / child nodes`
			`htmlWalkDOM(node->children);`

			`// closing tag`
			`if (element_number < HTML_ELEMENTS)`
			`{`
			`endElement(element_number);`
			`}`
			`else`
			`{`
			`endUnparsedElement((const char *)(node->name), xmlStrlen(node->name));`
			`}`
			`}`

			`// ***************************************************************************`
			`// recursive function to walk html document`
			`void CGroupHTML::htmlWalkDOM(xmlNode *a_node)`
			`{`
			`SGML_dtd *HTML_DTD = HTML_dtd ();`

			`uint element_number;`
			`xmlNode *node = a_node;`
			`while(node)`
			`{`
			`if (node->type == XML_TEXT_NODE)`
			`{`
			`addText((const char *)(node->content), xmlStrlen(node->content));`
			`}`
			`else`
			`if (node->type == XML_ELEMENT_NODE)`
			`{`
			`// find libwww tag`
			`for(element_number = 0; element_number<HTML_ELEMENTS; ++element_number)`
			`{`
Remove libwww, only keep html entitites list 2015-04-17 12:10:00 +00:00			`if (xmlStrncasecmp(node->name, (const xmlChar *)HTML_DTD->tags[element_number].name.c_str(), xmlStrlen(node->name)) == 0)`
Implement html parser using libxml2 2015-04-13 17:52:34 +00:00			`break;`
			`}`

			`htmlElement(node, element_number);`
			`}`

			`// move into next sibling`
			`node = node->next;`
			`}`
			`}`

Ensure that html is valid utf8 and that tags come in correct(ish) order 2015-04-18 20:23:23 +00:00			`// ***************************************************************************`
			`// http://stackoverflow.com/a/18335183`
			`static std::string correct_non_utf_8(const std::string &str)`
			`{`
			`int i,f_size=str.size();`
			`unsigned char c,c2,c3,c4;`
			`std::string to;`
			`to.reserve(f_size);`

			`for(i=0 ; i<f_size ; i++){`
			`c=(unsigned char)(str[i]);`
			`if(c<32){//control char`
			`if(c==9 \|\| c==10 \|\| c==13){//allow only \t \n \r`
			`to.append(1,c);`
			`}`
			`continue;`
			`}else if(c<127){//normal ASCII`
			`to.append(1,c);`
			`continue;`
			`}else if(c<160){//control char (nothing should be defined here either ASCI, ISO_8859-1 or UTF8, so skipping)`
			`if(c==128){//fix microsoft mess, add euro`
			`to.append(1,226);`
			`to.append(1,130);`
			`to.append(1,172);`
			`}`
			`if(c==133){//fix IBM mess, add NEL = \n\r`
			`to.append(1,10);`
			`to.append(1,13);`
			`}`
			`continue;`
			`}else if(c<192){//invalid for UTF8, converting ASCII`
			`to.append(1,(unsigned char)194);`
			`to.append(1,c);`
			`continue;`
			`}else if(c<194){//invalid for UTF8, converting ASCII`
			`to.append(1,(unsigned char)195);`
			`to.append(1,c-64);`
			`continue;`
			`}else if(c<224 && i+1<f_size){//possibly 2byte UTF8`
			`c2=(unsigned char)(str[i+1]);`
			`if(c2>127 && c2<192){//valid 2byte UTF8`
			`if(c==194 && c2<160){//control char, skipping`
			`;`
			`}else{`
			`to.append(1,c);`
			`to.append(1,c2);`
			`}`
			`i++;`
			`continue;`
			`}`
			`}else if(c<240 && i+2<f_size){//possibly 3byte UTF8`
			`c2=(unsigned char)(str[i+1]);`
			`c3=(unsigned char)(str[i+2]);`
			`if(c2>127 && c2<192 && c3>127 && c3<192){//valid 3byte UTF8`
			`to.append(1,c);`
			`to.append(1,c2);`
			`to.append(1,c3);`
			`i+=2;`
			`continue;`
			`}`
			`}else if(c<245 && i+3<f_size){//possibly 4byte UTF8`
			`c2=(unsigned char)(str[i+1]);`
			`c3=(unsigned char)(str[i+2]);`
			`c4=(unsigned char)(str[i+3]);`
			`if(c2>127 && c2<192 && c3>127 && c3<192 && c4>127 && c4<192){//valid 4byte UTF8`
			`to.append(1,c);`
			`to.append(1,c2);`
			`to.append(1,c3);`
			`to.append(1,c4);`
			`i+=3;`
			`continue;`
			`}`
			`}`
			`//invalid UTF8, converting ASCII (c>245 \|\| string too short for multi-byte))`
			`to.append(1,(unsigned char)195);`
			`to.append(1,c-64);`
			`}`
			`return to;`
			`}`

			`// ***************************************************************************`
			`static void patchHtmlQuirks(std::string &htmlString)`
			`{`
			`size_t npos = std::string::npos;`
			`size_t pos;`

			`// get rid of BOM (some ingame help files does not show up otherwise)`
			`if (htmlString.substr(0, 3) == "\xEF\xBB\xBF")`
			`{`
			`htmlString.erase(0, 3);`
			`}`

			`// if any element is before <html>, then parser adds <html><body>`
			`// and original tags are ignored (their attributes not processed)`
			`//`
			`// only fix situation when there is <body> tag with attributes`
			`//`
			`// tags are considered to be lowercase`

			`pos = htmlString.find("<body ");`
			`if (pos != npos)`
			`{`
			`size_t start = htmlString.find("<");`
			`// skip <!doctype html>`
			`if (htmlString.substr(start, 2) == "<!")`
			`start = htmlString.find("<", start + 1);`

			`// if there is no html tag, then abort`
			`size_t end = htmlString.find("<html>");`
			`if (end != npos && start < end && end < pos)`
			`{`
			`// body tag end position`
			`size_t insert = htmlString.find(">", pos);`
			`if (insert != npos)`
			`{`
			`std::string str = htmlString.substr(start, end - start);`
			`htmlString.insert(insert+1, str);`
			`htmlString.erase(start, str.size());`
			`}`
			`}`
			`}`

			`// make sure </html> (if present) is last in document or tags coming after it are ignored`
			`pos = htmlString.find("</html>");`
			`if (pos != npos && htmlString.find("<", pos+1) > pos)`
			`{`
			`htmlString.erase(pos, 7);`
			`htmlString += "</html>";`
			`}`

			`// if there is invalid utf-8 chars, then libxml will break everything after first it finds.`
			`htmlString = correct_non_utf_8(htmlString);`
			`}`

Implement html parser using libxml2 2015-04-13 17:52:34 +00:00			`// ***************************************************************************`
			`bool CGroupHTML::parseHtml(std::string htmlString)`
			`{`
Ensure that html is valid utf8 and that tags come in correct(ish) order 2015-04-18 20:23:23 +00:00			`htmlParserCtxtPtr parser = htmlCreatePushParserCtxt(NULL, NULL, NULL, 0, NULL, XML_CHAR_ENCODING_UTF8);`
Implement html parser using libxml2 2015-04-13 17:52:34 +00:00			`if (!parser)`
			`{`
			`nlwarning("Creating html parser context failed");`
			`return false;`
			`}`

			`htmlCtxtUseOptions(parser, HTML_PARSE_NOBLANKS \| HTML_PARSE_NOERROR \| HTML_PARSE_NOWARNING \| HTML_PARSE_NONET);`

Ensure that html is valid utf8 and that tags come in correct(ish) order 2015-04-18 20:23:23 +00:00			`// parser is little strict on tag order, so fix whats needed`
			`patchHtmlQuirks(htmlString);`

Implement html parser using libxml2 2015-04-13 17:52:34 +00:00			`htmlParseChunk(parser, htmlString.c_str(), htmlString.size(), 0);`
			`htmlParseChunk(parser, "", 0, 1);`

			`bool success = true;`
			`if (parser->myDoc)`
			`{`
			`xmlNode *root = xmlDocGetRootElement(parser->myDoc);`
			`if (root)`
			`{`
			`htmlWalkDOM(root);`
			`}`
			`else`
			`{`
			`nlwarning("html root node failed");`
			`success = false;`
			`}`
			`}`
			`else`
			`{`
			`nlwarning("htmlstring parsing failed");`
			`success = false;`
			`}`

			`htmlFreeParserCtxt(parser);`
			`return success;`
			`}`

			`// ***************************************************************************`
			`int CGroupHTML::luaParseHtml(CLuaState &ls)`
			`{`
			`const char *funcName = "parseHtml";`
			`CLuaIHM::checkArgCount(ls, funcName, 1);`
			`CLuaIHM::checkArgType(ls, funcName, 1, LUA_TSTRING);`
			`std::string html = ls.toString(1);`

			`parseHtml(html);`

			`return 0;`
			`}`

			`}`