diff --git a/plugins.d/linkstats.php b/plugins.d/linkstats.php
new file mode 100644
index 0000000..f127a07
--- /dev/null
+++ b/plugins.d/linkstats.php
@@ -0,0 +1,128 @@
+<?php
+
+// NOTE(review): only `register( ... )` survives in the reviewed text; the
+// object this call is made on appears to be missing and must be restored
+// before this patch is applied.
+register( "CensusPluginLinkStats" );
+
+/**
+ * Census plugin that sorts the <a> links of a fetched page into javascript:
+ * pseudo-links, links inside the crawled site and links leaving it.
+ */
+class CensusPluginLinkStats extends CensusPlugin
+{
+
+    protected $id = "linkStats";
+
+    /**
+     * Byte-wise, case-sensitive suffix test.
+     *
+     * @param string $haystack String to inspect.
+     * @param string $needle   Suffix to look for; "" matches every haystack.
+     * @return bool
+     */
+    private function endsWith($haystack, $needle)
+    {
+        $length = strlen($needle);
+        if ($length == 0) {
+            return true;
+        }
+
+        return (substr($haystack, -$length) === $needle);
+    }
+
+    /**
+     * Decide whether a link's host belongs to the crawled site.
+     *
+     * @param string $host     Host taken from the link, in any case.
+     * @param string $hostname Lower-cased host of the crawled page.
+     * @param string $site     Last (up to) three labels of $hostname.
+     * @return bool
+     */
+    private function isInternalHost($host, $hostname, $site)
+    {
+        $host = strtolower($host);
+        if ($host === $hostname) {
+            return true;
+        }
+        if ($site === "") {
+            return false;
+        }
+
+        // Require a label boundary so that "notexample.org" is not taken
+        // for a part of "example.org".
+        return $host === $site || $this->endsWith($host, "." . $site);
+    }
+
+    /**
+     * Collect link statistics for one fetched page.
+     *
+     * @param object $curl Reads $curl->info["url"] and $curl->webpage.
+     * @return array Keys "script" (list of href strings), "external" and
+     *               "internal" (lists of 'link' / 'text' / 'title' arrays).
+     */
+    public function applyTo( $curl )
+    {
+        $scripts = array();
+        $externals = array();
+        $internals = array();
+
+        $url_bits = parse_url($curl->info["url"]);
+        $hostname = isset($url_bits["host"]) ? strtolower($url_bits["host"]) : "";
+        $site = implode(".", array_slice(explode(".", $hostname), -3));
+
+        $dom = new DOMDocument();
+        // loadHTML() rejects an empty document (PHP 8 throws a ValueError,
+        // which "@" cannot silence), so only parse when there is markup.
+        if ((string)$curl->webpage !== "") {
+            @$dom->loadHTML( $curl->webpage );
+        }
+
+        $xpath = new DOMXpath($dom);
+        foreach ($xpath->query("//a") as $link_node) {
+            $href = $link_node->getAttribute("href");
+            if ($href === "") {
+                continue;
+            }
+
+            // parse_url() returns false for seriously malformed URLs and
+            // leaves out the components a URL does not have.
+            $url_info = parse_url($href);
+            if (!is_array($url_info)) {
+                $url_info = array();
+            }
+            $scheme = isset($url_info["scheme"]) ? strtolower($url_info["scheme"]) : "";
+            $host = isset($url_info["host"]) ? $url_info["host"] : "";
+
+            if ($scheme === "javascript") {
+                $scripts[] = $href;
+                continue;
+            }
+
+            $data = array(
+                'link' => $href,
+                'text' => $link_node->nodeValue,
+                'title' => $link_node->getAttribute("title"),
+            );
+            if ($host !== "") {
+                // Covers absolute and protocol-relative ("//host/path") links.
+                if ($this->isInternalHost($host, $hostname, $site)) {
+                    $internals[] = $data;
+                }
+                else {
+                    $externals[] = $data;
+                }
+            }
+            else if ($scheme !== "") {
+                // Host-less schemes such as mailto: count as leaving the site.
+                $externals[] = $data;
+            }
+            else {
+                // Relative reference: stays on the crawled site.
+                $internals[] = $data;
+            }
+        }
+
+        return array("script" => $scripts, "external" => $externals, "internal" => $internals);
+    }
+}
diff --git a/plugins.d/textstats.php b/plugins.d/textstats.php
index bbf0d45..8fb0790 100644
--- a/plugins.d/textstats.php
+++ b/plugins.d/textstats.php
@@ -10,11 +10,27 @@
 
 class CensusPluginTextStats extends CensusPlugin
 {
-	protected $id = "textStats";
-	
+    protected $id = "textStats";
+
 	public function applyTo( $curl )
 	{
 		global $textstatistics;
+        // Score the visible text only: strip the markup and blank out the
+        // contents of <script> and <style> elements first.
+        $text_data = "";
+        // loadHTML() rejects an empty document (PHP 8 throws a ValueError,
+        // which "@" cannot silence), so only parse when there is markup.
+        if ((string)$curl->webpage !== "") {
+            $dom = new DOMDocument();
+            @$dom->loadHTML( $curl->webpage );
+
+            foreach(array("script", "style") as $tag) {
+                foreach($dom->getElementsByTagName($tag) as $tag_el) {
+                    $tag_el->nodeValue = "";
+                }
+            }
+            $text_data = $dom->textContent;
+        }
 
 		$r = array();
 		// functions in library we're not using:
@@ -23,29 +39,29 @@ public function applyTo( $curl )
 		//spache_difficult_word_count($curl->webpage)
 		//words_with_three_syllables($curl->webpage, $blnCountProperNouns = true)
 		//percentage_words_with_three_syllables($curl->webpage, $blnCountProperNouns = true)
-		$r["flesch_kincaid_reading_ease"] = $textstatistics->flesch_kincaid_reading_ease($curl->webpage);
-		$r["flesch_kincaid_grade"] = $textstatistics->flesch_kincaid_grade_level($curl->webpage);
-		$r["gunning_fog"] = $textstatistics->gunning_fog_score($curl->webpage);
-		$r["coleman_liau"] = $textstatistics->coleman_liau_index($curl->webpage);
-		$r["smog"] = $textstatistics->smog_index($curl->webpage);
-		$r["automated_readability"] = $textstatistics->automated_readability_index($curl->webpage);
-		$r["dale_chall_readability"] = $textstatistics->dale_chall_readability_score($curl->webpage);
-		$r["spache_readability"] = $textstatistics->spache_readability_score($curl->webpage);
-
-		$r["letters"] = $textstatistics->letter_count($curl->webpage);
-		$r["sentences"] = $textstatistics->sentence_count($curl->webpage);
-		$r["words"] = $textstatistics->word_count($curl->webpage);
-		$r["syllables"] = $textstatistics->total_syllables($curl->webpage);
-
-		$r["words_per_sentence"] = $textstatistics->average_words_per_sentence($curl->webpage);
-		$r["syllables_per_word"] = $textstatistics->average_syllables_per_word($curl->webpage);
+        $r["flesch_kincaid_reading_ease"] = $textstatistics->flesch_kincaid_reading_ease($text_data);
+        $r["flesch_kincaid_grade"] = $textstatistics->flesch_kincaid_grade_level($text_data);
+        $r["gunning_fog"] = $textstatistics->gunning_fog_score($text_data);
+        $r["coleman_liau"] = $textstatistics->coleman_liau_index($text_data);
+        $r["smog"] = $textstatistics->smog_index($text_data);
+        $r["automated_readability"] = $textstatistics->automated_readability_index($text_data);
+        $r["dale_chall_readability"] = $textstatistics->dale_chall_readability_score($text_data);
+        $r["spache_readability"] = $textstatistics->spache_readability_score($text_data);
+
+        $r["letters"] = $textstatistics->letter_count($text_data);
+        $r["sentences"] = $textstatistics->sentence_count($text_data);
+        $r["words"] = $textstatistics->word_count($text_data);
+        $r["syllables"] = $textstatistics->total_syllables($text_data);
+
+        $r["words_per_sentence"] = $textstatistics->average_words_per_sentence($text_data);
+        $r["syllables_per_word"] = $textstatistics->average_syllables_per_word($text_data);
 
 		foreach( $r as $k=>&$v )
 		{
 			$v = round( $v*100 )/100;
 		}
 		return $r;
-	}
+    }
 	function resultToGraph( $graph, $result, $observation_uri )
 	{
 		foreach( $result as $key=>$value )