kerozcak 4d7293d0f5 Changed: #1315 Adding work i have done so far.
branch : gsoc2011-kerozcak
2011-06-09 18:23:29 +02:00

382 lines
14 KiB

* base include file for SimpleTest
* @package SimpleTest
* @subpackage WebTester
* @version $Id: php_parser.php 1911 2009-07-29 16:38:04Z lastcraft $
* Builds the page object.
* @package SimpleTest
* @subpackage WebTester
class SimpleTidyPageBuilder {
// Page object under construction; parse() assigns a fresh SimplePage to it.
private $page;
// Forms accumulated by walkTree(), one entry per form node encountered.
private $forms = array();
// Label tags collected during the walk and later handed to attachLabels().
private $labels = array();
// Widget index keyed by the "id" attribute; each entry is a list of widgets sharing that id.
private $widgets_by_id = array();
// Destructor hook.
// NOTE(review): the body is not visible in this view - presumably it releases references via free(); confirm.
public function __destruct() {
* Frees up any references so as to allow the PHP garbage
* collection from unset() to work.
/**
 * Empties the per-parse collections so the objects they reference can
 * be garbage collected.
 *
 * The id index is cleared together with the forms and labels. It is
 * only ever appended to while indexing widgets, so leaving it populated
 * would let widgets from an earlier page be matched by id if the same
 * builder instance is used for another parse.
 */
private function free() {
    $this->forms = array();
    $this->labels = array();
    $this->widgets_by_id = array();
}
* This builder is only available if the 'tidy' extension is loaded.
* @return boolean True if available.
/**
 * Reports whether this builder is usable, which requires the PHP
 * 'tidy' extension to be loaded.
 *
 * @return boolean    True when the extension is present.
 */
function can() {
    $has_tidy = extension_loaded('tidy');
    return $has_tidy;
}
* Reads the raw content of the page using HTML Tidy.
* @param $response SimpleHttpResponse Fetched response.
* @return SimplePage Newly parsed page.
// Builds a SimplePage from the response by running its content through HTML Tidy.
function parse($response) {
// Start a fresh page object for this response.
$this->page = new SimplePage($response);
// Guards are inserted before tidying; the guarded markup is also kept in $input.
$tidied = tidy_parse_string($input = $this->insertGuards($response->getContent()),
// Tidy options: 'output-xml' off, 'wrap' set to '0', 'indent' set to 'no'.
array('output-xml' => false, 'wrap' => '0', 'indent' => 'no'),
// NOTE(review): the rest of the tidy_parse_string() call and any tree walk are not visible in this view.
// Attach the collected labels to the widgets indexed by id.
$this->attachLabels($this->widgets_by_id, $this->labels);
// The page is returned through a local copy.
$page = $this->page;
return $page;
* Stops HTMLTidy stripping content that we wish to preserve.
* @param string The raw html.
* @return string The html with guard tags inserted.
/**
 * Applies both guard passes to the raw markup: textarea whitespace
 * guards first, then empty-tag guards on the result.
 *
 * @param string $html    The raw html.
 * @return string         The html with guard tags inserted.
 */
private function insertGuards($html) {
    $with_whitespace_guards = $this->insertTextareaSimpleWhitespaceGuards($html);
    $with_all_guards = $this->insertEmptyTagGuards($with_whitespace_guards);
    return $with_all_guards;
}
* Removes the extra content added during the parse stage
* in order to preserve content we don't want stripped
* out by HTMLTidy.
* @param string The raw html.
* @return string The html with guard tags removed.
/**
 * Undoes both guard passes: empty-tag markers are removed first, then
 * the textarea whitespace tokens are turned back into whitespace.
 *
 * @param string $html    The html containing guards.
 * @return string         The html with guard tags removed.
 */
private function stripGuards($html) {
    $without_empty_markers = $this->stripEmptyTagGuards($html);
    $restored = $this->stripTextareaWhitespaceGuards($without_empty_markers);
    return $restored;
}
* HTML tidy strips out empty tags such as <option> which we
* need to preserve. This method inserts an additional marker.
* @param string The raw html.
* @return string The html with guards inserted.
// Targets option/textarea elements whose body is empty or whitespace-only.
private function insertEmptyTagGuards($html) {
// Pattern flags: "i" (case-insensitive) and "s" (dot matches newline).
// NOTE(review): the replacement and subject arguments are not visible in this view.
return preg_replace('#<(option|textarea)([^>]*)>(\s*)</(option|textarea)>#is',
* HTML tidy strips out empty tags such as <option> which we
* need to preserve. This method strips additional markers
* inserted by SimpleTest to the tidy output used to make the
* tags non-empty. This ensures their preservation.
* @param string The raw html.
* @return string The html with guards removed.
/**
 * Removes the ___EMPTY___ markers that were inserted to keep empty
 * tags alive through tidy.
 *
 * Only the marker itself is dropped. The delimiter matched before it
 * (start of string or ">") and after it ("</" or end of string) is put
 * back, as is any surrounding whitespace, so enclosing tags stay intact
 * when the marker sits inside markup.
 *
 * @param string $html    The html containing markers.
 * @return string         The html with markers removed.
 */
private function stripEmptyTagGuards($html) {
    return preg_replace('#(^|>)(\s*)___EMPTY___(\s*)(</|$)#i', '$1$2$3$4', $html);
}
* By parsing the XML output of tidy, we lose some whitespace
* information in textarea tags. We temporarily recode this
* data ourselves so as not to lose it.
* @param string The raw html.
* @return string The html with guards inserted.
// Runs every <textarea>...</textarea> span through insertWhitespaceGuards().
private function insertTextareaSimpleWhitespaceGuards($html) {
// The body is matched non-greedily; flags "is" make the match case-insensitive and let "." cross newlines.
return preg_replace_callback('#<textarea([^>]*)>(.*?)</textarea>#is',
array($this, 'insertWhitespaceGuards'),
// NOTE(review): the subject argument is not visible in this view.
* Callback for insertTextareaSimpleWhitespaceGuards().
* @param array $matches Result of preg_replace_callback().
* @return string Guard tags now replace whitespace.
// Rebuilds the opening textarea tag and recodes whitespace in the body as guard tokens.
private function insertWhitespaceGuards($matches) {
// $matches[1] is the attribute text of the tag, $matches[2] the textarea body.
return '<textarea' . $matches[1] . '>' .
// Newline, carriage return, tab and space each map to their own token.
str_replace(array("\n", "\r", "\t", ' '),
array('___NEWLINE___', '___CR___', '___TAB___', '___SPACE___'),
$matches[2]) .
// NOTE(review): the closing part of this expression is not visible in this view.
* Removes the whitespace preserving guards we added
* before parsing.
* @param string The raw html.
* @return string The html with guards removed.
// Inverse of insertWhitespaceGuards(): turns guard tokens back into the whitespace they stand for.
private function stripTextareaWhitespaceGuards($html) {
return str_replace(array('___NEWLINE___', '___CR___', '___TAB___', '___SPACE___'),
array("\n", "\r", "\t", ' '),
// NOTE(review): the subject argument is not visible in this view.
* Visits the given node and all children
* @param object $node Tidy XML node.
// Inspects one Tidy node and dispatches on its tag name to record links, title, forms and labels.
private function walkTree($node) {
if ($node->name == 'a') {
// Anchor: a tag built from the node is added to the page's links.
$this->page->addLink($this->tags()->createTag($node->name, (array)$node->attribute)
// NOTE(review): the rest of the statement above is not visible in this view.
} elseif ($node->name == 'base' and isset($node->attribute['href'])) {
// NOTE(review): the handling of a base node with an href is not visible in this view.
} elseif ($node->name == 'title') {
// Title: a tag built from the node is set as the page title.
$this->page->setTitle($this->tags()->createTag($node->name, (array)$node->attribute)
// NOTE(review): the rest of the statement above is not visible in this view.
} elseif ($node->name == 'frameset') {
// NOTE(review): frameset handling is not visible in this view.
} elseif ($node->name == 'form') {
// Form: an empty form facade is created, filled by walkForm(), and queued on $this->forms.
$this->forms[] = $this->walkForm($node, $this->createEmptyForm($node));
} elseif ($node->name == 'label') {
// Label: a tag built from the node is queued on $this->labels.
$this->labels[] = $this->tags()->createTag($node->name, (array)$node->attribute)
// NOTE(review): the rest of the statement above is not visible in this view.
} else {
// NOTE(review): the fallback branch body is not visible in this view.
* Helper method for traversing the XML tree.
* @param object $node Tidy XML node.
// Iterates over the node's children when it has any.
private function walkChildren($node) {
if ($node->hasChildren()) {
foreach ($node->child as $child) {
// NOTE(review): the loop body is not visible in this view - presumably each child is passed to walkTree(); confirm.
* Facade for forms containing preparsed widgets.
* @param object $node Tidy XML node.
* @return SimpleForm Facade for SimpleBrowser.
/**
 * Creates the form facade for a form node, before any widgets have
 * been added to it.
 *
 * @param object $node    Tidy XML node of the form.
 * @return SimpleForm     Form built from the node's tag and the current page.
 */
private function createEmptyForm($node) {
    $form_attributes = (array)$node->attribute;
    $form_tag = $this->tags()->createTag($node->name, $form_attributes);
    return new SimpleForm($form_tag, $this->page);
}
* Visits the given node and all children
* @param object $node Tidy XML node.
// Recursively walks a form's subtree, adding widgets to $form and collecting links and labels; returns $form.
private function walkForm($node, $form, $enclosing_label = '') {
if ($node->name == 'a') {
// Anchors inside a form are still registered as page links.
$this->page->addLink($this->tags()->createTag($node->name, (array)$node->attribute)
// NOTE(review): the rest of the statement above is not visible in this view.
} elseif (in_array($node->name, array('input', 'button', 'textarea', 'select'))) {
// Form controls are added to the form together with the text of any enclosing label.
$this->addWidgetToForm($node, $form, $enclosing_label);
} elseif ($node->name == 'label') {
$this->labels[] = $this->tags()->createTag($node->name, (array)$node->attribute)
// NOTE(review): the rest of the statement above is not visible in this view.
if ($node->hasChildren()) {
foreach ($node->child as $child) {
// Children of a label are walked with the label's normalised inner HTML as their enclosing label.
$this->walkForm($child, $form, SimplePage::normalise($this->innerHtml($node)));
} elseif ($node->hasChildren()) {
foreach ($node->child as $child) {
// Any other node: recurse with the default (empty) enclosing label.
$this->walkForm($child, $form);
return $form;
* Tests a node for a "for" attribute. Used for
* attaching labels.
* @param object $node Tidy XML node.
* @return boolean True if the "for" attribute exists.
/**
 * Tests whether a node carries a non-empty "for" attribute. Used when
 * attaching labels.
 *
 * The key is checked before it is read, so nodes that have attributes
 * but no "for" entry do not raise an undefined-index notice.
 *
 * @param object $node    Tidy XML node.
 * @return boolean        True if a non-empty "for" attribute exists.
 */
private function hasFor($node) {
    return ! empty($node->attribute['for']);
}
* Adds the widget into the form container.
* @param object $node Tidy XML node of widget.
* @param SimpleForm $form Form to add it to.
* @param string $enclosing_label The label of any label
* tag we might be in.
// Builds a widget tag from the node for insertion into the form.
private function addWidgetToForm($node, $form, $enclosing_label) {
// Attributes are taken from attributes($node), which parses the node's markup, rather than from $node->attribute.
$widget = $this->tags()->createTag($node->name, $this->attributes($node));
if (! $widget) {
// NOTE(review): the handling of a failed tag creation is not visible in this view.
if ($node->name == 'select') {
// NOTE(review): the select-specific handling and the rest of the method are not visible in this view.
* Fills the widget cache to speed up searching.
* @param SimpleTag $widget Parsed widget to cache.
// Records the widget under its "id" attribute in $this->widgets_by_id.
private function indexWidgetById($widget) {
$id = $widget->getAttribute('id');
if (! $id) {
// NOTE(review): the body of this branch is not visible in this view - presumably widgets without an id are skipped; confirm.
if (! isset($this->widgets_by_id[$id])) {
// First widget seen with this id: start its list.
$this->widgets_by_id[$id] = array();
// Entries are lists, so several widgets can be stored under the same id.
$this->widgets_by_id[$id][] = $widget;
* Parses the options from inside an XML select node.
* @param object $node Tidy XML node.
* @return array List of SimpleTag options.
// Recursively gathers option tags found at or below the given node.
private function collectSelectOptions($node) {
$options = array();
if ($node->name == 'option') {
// Option attributes come from attributes($node), which parses the node's markup.
$options[] = $this->tags()->createTag($node->name, $this->attributes($node))
// NOTE(review): the rest of the statement above is not visible in this view.
if ($node->hasChildren()) {
foreach ($node->child as $child) {
// Options found in each child subtree are appended in child order.
$options = array_merge($options, $this->collectSelectOptions($child));
return $options;
* Convenience method for collecting all the attributes
* of a tag. Not sure why Tidy does not have this.
* @param object $node Tidy XML node.
* @return array Hash of attribute strings.
/**
 * Parses the attributes out of the first tag found in the node's
 * markup and returns them as a hash.
 *
 * @param object $node    Tidy XML node.
 * @return array          Hash of attribute strings, empty when the
 *                        markup has no tag with attribute text.
 */
private function attributes($node) {
    $has_tag = preg_match('|<[^ ]+\s(.*?)/?>|s', $node->value, $first_tag_contents);
    if (! $has_tag) {
        return array();
    }
    // Split the attribute text into single-quoted, double-quoted,
    // unquoted and bare (value-less) attribute chunks.
    preg_match_all('/\S+\s*=\s*\'[^\']*\'|(\S+\s*=\s*"[^"]*")|([^ =]+\s*=\s*[^ "\']+?)|[^ "\']+/', $first_tag_contents[1], $matches);
    $collected = array();
    foreach ($matches[0] as $chunk) {
        $collected = $this->mergeAttribute($collected, $chunk);
    }
    return $collected;
}
* Overlay an attribute into the attributes hash.
* @param array $attributes Current attribute list.
* @param string $raw Raw attribute string with
* both key and value.
* @return array New attribute hash.
/**
 * Overlays one raw attribute onto the attributes hash.
 *
 * The raw string is split on the first "=" only, so values that
 * themselves contain "=" (for example a query string in an href) are
 * kept whole. An attribute without "=" uses its own name as the value.
 *
 * @param array $attributes    Current attribute list.
 * @param string $raw          Raw attribute string with both key and value.
 * @return array               New attribute hash.
 */
private function mergeAttribute($attributes, $raw) {
    $parts = explode('=', $raw, 2);
    list($name, $value) = count($parts) == 1 ? array($parts[0], $parts[0]) : $parts;
    $attributes[trim($name)] = html_entity_decode($this->dequote(trim($value)), ENT_QUOTES);
    return $attributes;
}
* Remove start and end quotes.
* @param string $quoted A quoted string.
* @return string Quotes are gone.
/**
 * Strips one pair of matching surrounding quotes, single or double,
 * from a string. Anything that is not fully wrapped in a matching
 * pair is returned unchanged.
 *
 * @param string $quoted    A possibly quoted string.
 * @return string           The string without its surrounding quotes.
 */
private function dequote($quoted) {
    $is_quoted = preg_match('/^(\'([^\']*)\'|"([^"]*)")$/', $quoted, $matches);
    if (! $is_quoted) {
        return $quoted;
    }
    // Group 3 is only present for the double-quoted alternative.
    if (isset($matches[3])) {
        return $matches[3];
    }
    return $matches[2];
}
* Collects frame information inside a frameset tag.
* @param object $node Tidy XML node.
* @return array List of SimpleTag frame descriptions.
/**
 * Gathers frame tags found at or below the given node.
 *
 * A frame node yields just its own tag and its children are not
 * visited; any other node yields the frames of its children, merged
 * in child order.
 *
 * @param object $node    Tidy XML node.
 * @return array          List of SimpleTag frame descriptions.
 */
private function collectFrames($node) {
    if ($node->name == 'frame') {
        return array($this->tags()->createTag($node->name, (array)$node->attribute));
    }
    $found = array();
    if ($node->hasChildren()) {
        foreach ($node->child as $descendant) {
            $found = array_merge($found, $this->collectFrames($descendant));
        }
    }
    return $found;
}
* Extracts the XML node text.
* @param object $node Tidy XML node.
* @return string The text only.
/**
 * Joins the values of the node's direct children and removes the
 * parser guards from the result.
 *
 * @param object $node    Tidy XML node.
 * @return string         Concatenated child values with guards stripped;
 *                        empty when the node has no children.
 */
private function innerHtml($node) {
    $pieces = array();
    if ($node->hasChildren()) {
        foreach ($node->child as $inner) {
            $pieces[] = $inner->value;
        }
    }
    return $this->stripGuards(implode('', $pieces));
}
* Factory for parsed content holders.
* @return SimpleTagBuilder Factory.
/**
 * Supplies the factory used to create tag objects. A new builder is
 * created on every call.
 *
 * @return SimpleTagBuilder    Factory.
 */
private function tags() {
    $builder = new SimpleTagBuilder();
    return $builder;
}
* Called at the end of a parse run. Attaches any
* non-wrapping labels to their form elements.
* @param array $widgets_by_id Cached SimpleTag hash.
* @param array $labels SimpleTag label elements.
private function attachLabels($widgets_by_id, $labels) {
foreach ($labels as $label) {
$for = $label->getFor();
if ($for and isset($widgets_by_id[$for])) {
$text = $label->getText();
foreach ($widgets_by_id[$for] as $widget) {