From b116a3a46d774b1f4f186d015aad6a6f69f4ec96 Mon Sep 17 00:00:00 2001 From: Mike Jewell Date: Mon, 1 Dec 2014 19:24:45 +0000 Subject: [PATCH 1/4] Added linkstats plugin --- plugins.d/linkstats.php | 66 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 66 insertions(+) create mode 100644 plugins.d/linkstats.php diff --git a/plugins.d/linkstats.php b/plugins.d/linkstats.php new file mode 100644 index 0000000..3d859ae --- /dev/null +++ b/plugins.d/linkstats.php @@ -0,0 +1,66 @@ +register( "CensusPluginLinkStats" ); +class CensusPluginLinkStats extends CensusPlugin +{ + + protected $id = "linkstats"; + + private function endsWith($haystack, $needle) + { + $length = strlen($needle); + if ($length == 0) { + return true; + } + + return (substr($haystack, -$length) === $needle); + } + + public function applyTo( $curl ) + { + $url_bits = parse_url($curl->info["url"]); + $hostname = $url_bits["host"]; + $dom = new DOMDocument(); + $urls = array(); + $externals = array(); + $internals = array(); + $scripts = array(); + $tld = implode(".", array_slice(explode(".", $hostname),-3)); + + + @$dom->loadHTML( $curl->webpage ); + $xpath = new DOMXpath($dom); + $link_nodes = $xpath->query("//a"); + foreach($link_nodes as $link_node) { + $href = $link_node->getAttribute("href"); + $url_info = parse_url($href); + + + if($url_info["scheme"] == "javascript") { + $scripts[] = $href; + } + else { + $data = array( + 'link' => $href, + 'text' => $link_node->nodeValue, + 'title' => $link_node->getAttribute("title"), + ); + if(!empty($url_info["scheme"])) { + if($hostname == $url_info["host"] || $this->endsWith($url_info["host"], $tld)) + { + $internals[] = $data; + } + else { + $externals[] = $data; + } + } + else { + $internals[] = $data; + } + } + } + $result = array("script" => $scripts, "external" => $externals, "internal" => $internals); + return $result; + } +} + From 8b3692722869011e58e0fdb126c3d8569a49ebf8 Mon Sep 17 00:00:00 2001 From: Mike Jewell Date: Mon, 1 Dec 2014 19:44:32 +0000 Subject: [PATCH 2/4] Handles empty urls --- plugins.d/linkstats.php | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/plugins.d/linkstats.php b/plugins.d/linkstats.php index 3d859ae..68698f1 100644 --- a/plugins.d/linkstats.php +++ b/plugins.d/linkstats.php @@ -27,7 +27,6 @@ public function applyTo( $curl ) $scripts = array(); $tld = implode(".", array_slice(explode(".", $hostname),-3)); - @$dom->loadHTML( $curl->webpage ); $xpath = new DOMXpath($dom); $link_nodes = $xpath->query("//a"); @@ -35,7 +34,9 @@ public function applyTo( $curl ) $href = $link_node->getAttribute("href"); $url_info = parse_url($href); - + if(empty($href)) { + continue; + } if($url_info["scheme"] == "javascript") { $scripts[] = $href; } From b3cd2757d1af0d817539d0c04f90d4c1a7415c2b Mon Sep 17 00:00:00 2001 From: Mike Jewell Date: Mon, 1 Dec 2014 20:14:24 +0000 Subject: [PATCH 3/4] Now uses text content, rather than html. --- plugins.d/textstats.php | 48 +++++++++++++++++++++++++---------------- 1 file changed, 29 insertions(+), 19 deletions(-) diff --git a/plugins.d/textstats.php b/plugins.d/textstats.php index bbf0d45..8fb0790 100644 --- a/plugins.d/textstats.php +++ b/plugins.d/textstats.php @@ -10,11 +10,21 @@ class CensusPluginTextStats extends CensusPlugin { - protected $id = "textStats"; - + protected $id = "textStats"; + public function applyTo( $curl ) { global $textstatistics; + $dom = new DOMDocument(); + @$dom->loadHTML( $curl->webpage ); + + foreach(array("script", "style") as $tag) { + $tag_els = $dom->getElementsByTagName($tag); + foreach($tag_els as $tag_el) { + $tag_el->nodeValue = ""; + } + } + $text_data = $dom->textContent; $r = array(); // functions in library we're not using: @@ -23,29 +33,29 @@ public function applyTo( $curl ) //spache_difficult_word_count($curl->webpage) //words_with_three_syllables($curl->webpage, $blnCountProperNouns = true) //percentage_words_with_three_syllables($curl->webpage, $blnCountProperNouns = true) - $r["flesch_kincaid_reading_ease"] = $textstatistics->flesch_kincaid_reading_ease($curl->webpage); - $r["flesch_kincaid_grade"] = $textstatistics->flesch_kincaid_grade_level($curl->webpage); - $r["gunning_fog"] = $textstatistics->gunning_fog_score($curl->webpage); - $r["coleman_liau"] = $textstatistics->coleman_liau_index($curl->webpage); - $r["smog"] = $textstatistics->smog_index($curl->webpage); - $r["automated_readability"] = $textstatistics->automated_readability_index($curl->webpage); - $r["dale_chall_readability"] = $textstatistics->dale_chall_readability_score($curl->webpage); - $r["spache_readability"] = $textstatistics->spache_readability_score($curl->webpage); - - $r["letters"] = $textstatistics->letter_count($curl->webpage); - $r["sentences"] = $textstatistics->sentence_count($curl->webpage); - $r["words"] = $textstatistics->word_count($curl->webpage); - $r["syllables"] = $textstatistics->total_syllables($curl->webpage); - - $r["words_per_sentence"] = $textstatistics->average_words_per_sentence($curl->webpage); - $r["syllables_per_word"] = $textstatistics->average_syllables_per_word($curl->webpage); + $r["flesch_kincaid_reading_ease"] = $textstatistics->flesch_kincaid_reading_ease($text_data); + $r["flesch_kincaid_grade"] = $textstatistics->flesch_kincaid_grade_level($text_data); + $r["gunning_fog"] = $textstatistics->gunning_fog_score($text_data); + $r["coleman_liau"] = $textstatistics->coleman_liau_index($text_data); + $r["smog"] = $textstatistics->smog_index($text_data); + $r["automated_readability"] = $textstatistics->automated_readability_index($text_data); + $r["dale_chall_readability"] = $textstatistics->dale_chall_readability_score($text_data); + $r["spache_readability"] = $textstatistics->spache_readability_score($text_data); + + $r["letters"] = $textstatistics->letter_count($text_data); + $r["sentences"] = $textstatistics->sentence_count($text_data); + $r["words"] = $textstatistics->word_count($text_data); + $r["syllables"] = $textstatistics->total_syllables($text_data); + + $r["words_per_sentence"] = $textstatistics->average_words_per_sentence($text_data); + $r["syllables_per_word"] = $textstatistics->average_syllables_per_word($text_data); foreach( $r as $k=>&$v ) { $v = round( $v*100 )/100; } return $r; - } + } function resultToGraph( $graph, $result, $observation_uri ) { foreach( $result as $key=>$value ) From 3b65246c3440d6631929b16bf171e43846ec633d Mon Sep 17 00:00:00 2001 From: Mike Jewell Date: Mon, 1 Dec 2014 20:28:32 +0000 Subject: [PATCH 4/4] linkstats -> linkStats --- plugins.d/linkstats.php | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/plugins.d/linkstats.php b/plugins.d/linkstats.php index 68698f1..f127a07 100644 --- a/plugins.d/linkstats.php +++ b/plugins.d/linkstats.php @@ -4,7 +4,7 @@ class CensusPluginLinkStats extends CensusPlugin { - protected $id = "linkstats"; + protected $id = "linkStats"; private function endsWith($haystack, $needle) {