<?php

/**
 * Small regex-driven HTML scraper.
 *
 * Elements returned by the query methods are plain associative arrays:
 *  - "tagName"    lower-cased tag name
 *  - "startPos"   byte offset of the opening tag inside the loaded markup
 *  - "endPos"     byte offset just past the paired closing tag (0 when unpaired)
 *  - "attributes" map of attribute name => value ("true" for valueless attributes)
 *  - "innerHTML"  markup between the opening and closing tag (null when unpaired)
 *  - "level"      nesting depth among tags of the same name (1 = outermost)
 *  - "outerHTML"  opening tag through closing tag (only present when paired)
 */
class fuckhtml
{
    /** @var string|null Markup currently loaded; null until load() has been called. */
    public $html;

    /** @var int|null Byte length of $html; null until load() has been called. */
    public $strlen;

    /**
     * @param string|array|null $html   Markup, file path, or element array; null defers loading.
     * @param bool              $isfile When true, $html is a path whose contents are loaded.
     *
     * @throws Exception See load().
     */
    public function __construct($html = null, $isfile = false)
    {
        if ($html !== null) {
            $this->load($html, $isfile);
        }
    }

    /**
     * Replace the markup this instance operates on.
     *
     * @param string|array $html   Markup, a file path (with $isfile), or an element
     *                             array whose "innerHTML" entry is used.
     * @param bool         $isfile When true, $html is read from disk.
     *
     * @throws Exception When an array without "innerHTML" is supplied or the file cannot be read.
     */
    public function load($html, $isfile = false)
    {
        if (is_array($html)) {
            if (!isset($html["innerHTML"])) {
                throw new Exception("(load) Supplied array doesn't contain a innerHTML index");
            }

            $html = $html["innerHTML"];
        }

        if ($isfile) {
            // file_get_contents copes with empty files, unlike fread() with a
            // zero length, and the explicit checks turn a missing or unreadable
            // file into an exception instead of a chain of warnings.
            if (!is_file($html) || !is_readable($html)) {
                throw new Exception("(load) Unable to read file");
            }

            $fetch = file_get_contents($html);

            if ($fetch === false) {
                throw new Exception("(load) Unable to read file");
            }

            $this->html = $fetch;
        } else {
            $this->html = $html;
        }

        $this->strlen = strlen($this->html);
    }

    /**
     * Find every element with the given tag name ("*" matches any tag).
     *
     * Opening and closing tags are located with regular expressions and then
     * paired by nesting depth, so unpaired opening tags (void elements, broken
     * markup) are still returned, with a null "innerHTML".
     *
     * @param string $tagname Tag name (case-insensitive) or "*".
     *
     * @return array List of element arrays in document order.
     */
    public function getElementsByTagName(string $tagname)
    {
        if ($tagname === "*") {
            $tagname = '[^\/<>\s]+';
        } else {
            // Pass the delimiter so a "/" in the name cannot terminate the pattern.
            $tagname = preg_quote(strtolower($tagname), "/");
        }

        $markup = (string)$this->html;
        $out = [];

        // Opening tags, e.g. <div class="mydiv">
        preg_match_all(
            '/<\s*(' . $tagname . ')(\s(?:[^>\'"]*|"[^"]*"|\'[^\']*\')+)?\s*>/i',
            $markup,
            $starting_tags,
            PREG_OFFSET_CAPTURE
        );

        $opening_count = count($starting_tags[0]);

        for ($i = 0; $i < $opening_count; $i++) {
            $out[] = [
                "tagName" => strtolower($starting_tags[1][$i][0]),
                "startPos" => $starting_tags[0][$i][1],
                "endPos" => 0,
                "startTag" => $starting_tags[0][$i][0],
                "attributes" => $this->parseAttributes($starting_tags[2][$i][0]),
                "innerHTML" => null
            ];
        }

        // Closing tags, e.g. </div>
        preg_match_all(
            '/<\s*\/\s*(' . $tagname . ')\s*>/i',
            $markup,
            $regex_closing_tags,
            PREG_OFFSET_CAPTURE
        );

        $closing_count = count($regex_closing_tags[1]);

        for ($i = 0; $i < $closing_count; $i++) {
            $out[] = [
                "tagName" => strtolower($regex_closing_tags[1][$i][0]),
                "endTag" => $regex_closing_tags[0][$i][0],
                "startPos" => $regex_closing_tags[0][$i][1]
            ];
        }

        // Interleave opening and closing tags in document order. The comparator
        // must return an integer; returning a bool is deprecated since PHP 8.0.
        usort(
            $out,
            static function ($a, $b) {
                return $a["startPos"] <=> $b["startPos"];
            }
        );

        // Compute the nesting level of every tag, tracked separately per tag name.
        $level = [];
        $count = count($out);

        for ($i = 0; $i < $count; $i++) {
            $name = $out[$i]["tagName"];

            if (!isset($level[$name])) {
                $level[$name] = 0;
            }

            if (isset($out[$i]["startTag"])) {
                // Opening tag: one level deeper.
                $level[$name]++;
                $out[$i]["level"] = $level[$name];
            } else {
                // Closing tag: belongs to the current level, then step back out.
                $out[$i]["level"] = $level[$name];
                $level[$name]--;
            }
        }

        // The first following closing tag with the same name and the same level
        // is THE closing tag of an opening tag.
        for ($i = 0; $i < $count; $i++) {
            if (!isset($out[$i]["startTag"])) {
                continue;
            }

            $content_start = $out[$i]["startPos"] + strlen($out[$i]["startTag"]);

            for ($k = $i + 1; $k < $count; $k++) {
                if (
                    !isset($out[$k]["endTag"]) ||
                    $out[$i]["tagName"] !== $out[$k]["tagName"] ||
                    $out[$i]["level"] !== $out[$k]["level"]
                ) {
                    continue;
                }

                $close_end = $out[$k]["startPos"] + strlen($out[$k]["endTag"]);

                $out[$i]["endPos"] = $close_end;
                $out[$i]["innerHTML"] = substr(
                    $markup,
                    $content_start,
                    $out[$k]["startPos"] - $content_start
                );
                $out[$i]["outerHTML"] = substr(
                    $markup,
                    $out[$i]["startPos"],
                    $close_end - $out[$i]["startPos"]
                );
                break;
            }
        }

        // Keep only the opening-tag entries and drop the internal "startTag" key.
        $elements = [];

        foreach ($out as $entry) {
            if (isset($entry["endTag"])) {
                continue;
            }

            unset($entry["startTag"]);
            $elements[] = $entry;
        }

        return $elements;
    }

    /**
     * Turn the raw attribute portion of an opening tag into a name => value map.
     *
     * Valueless attributes (and attributes with an empty value) map to the
     * string "true"; surrounding quotes and whitespace are stripped from values.
     *
     * @param string $raw Text between the tag name and the closing ">".
     *
     * @return array Map of attribute name => string value.
     */
    private function parseAttributes(string $raw): array
    {
        $attributes = [];

        preg_match_all(
            '/([^\/\s\\=]+)(?:\s*=\s*("[^"]*"|\'[^\']*\'|[^\s]*))?/',
            $raw,
            $regex_attributes
        );

        foreach ($regex_attributes[1] as $index => $attribute_name) {
            $attribute_value = $regex_attributes[2][$index];

            if (trim($attribute_value) === "") {
                $attributes[$attribute_name] = "true";
                continue;
            }

            $attributes[$attribute_name] = trim($attribute_value, "'\" \n\r\t\v\x00");
        }

        return $attributes;
    }

    /**
     * Keep the elements that carry the given attribute.
     *
     * @param string            $name       Attribute name (matched exactly as written in the markup).
     * @param array|string|null $collection Element list to filter, a tag name to
     *                                      query first, or null for every element.
     *
     * @return array List of matching element arrays.
     */
    public function getElementsByAttributeName(string $name, $collection = null)
    {
        if ($collection === null) {
            $collection = $this->getElementsByTagName("*");
        } elseif (is_string($collection)) {
            $collection = $this->getElementsByTagName($collection);
        }

        $return = [];

        foreach ($collection as $elem) {
            if (
                isset($elem["attributes"]) &&
                is_array($elem["attributes"]) &&
                array_key_exists($name, $elem["attributes"])
            ) {
                $return[] = $elem;
            }
        }

        return $return;
    }

    /**
     * Keep the elements whose attribute $name contains every space-separated
     * token of $value, in any order (how CSS class matching works).
     *
     * @param string            $name       Attribute to inspect.
     * @param string            $value      Space-separated tokens that must all be present.
     * @param array|string|null $collection See getElementsByAttributeName().
     *
     * @return array List of matching element arrays.
     */
    public function getElementsByFuzzyAttributeValue(string $name, string $value, $collection = null)
    {
        $elems = $this->getElementsByAttributeName($name, $collection);
        $wanted = explode(" ", $value);

        $return = [];

        foreach ($elems as $elem) {
            // Only the requested attribute is inspected; a matching token in an
            // unrelated attribute must not count.
            $present = explode(" ", (string)$elem["attributes"][$name]);

            // Containment test: repeated tokens on either side do not matter.
            if (array_diff($wanted, $present) === []) {
                $return[] = $elem;
            }
        }

        return $return;
    }

    /**
     * Keep the elements whose attribute $name equals $value exactly.
     *
     * @param string            $name       Attribute to inspect.
     * @param string            $value      Required attribute value.
     * @param array|string|null $collection See getElementsByAttributeName().
     *
     * @return array List of matching element arrays.
     */
    public function getElementsByAttributeValue(string $name, string $value, $collection = null)
    {
        $elems = $this->getElementsByAttributeName($name, $collection);

        $return = [];

        foreach ($elems as $elem) {
            // Compare the requested attribute only, as strings, so values such
            // as "1e3" and "1000" are not treated as equal.
            if ((string)$elem["attributes"][$name] === $value) {
                $return[] = $elem;
            }
        }

        return $return;
    }

    /**
     * Return the first element whose id attribute equals $idname.
     *
     * @param string            $idname     Id to look for.
     * @param array|string|null $collection See getElementsByAttributeName().
     *
     * @return array|false The element array, or false when nothing matches.
     */
    public function getElementById(string $idname, $collection = null)
    {
        $id = $this->getElementsByAttributeValue("id", $idname, $collection);

        if (count($id) !== 0) {
            return $id[0];
        }

        return false;
    }

    /**
     * Return the elements whose class attribute contains every given class.
     *
     * @param string            $classname  One or more space-separated class names.
     * @param array|string|null $collection See getElementsByAttributeName().
     *
     * @return array List of matching element arrays.
     */
    public function getElementsByClassName(string $classname, $collection = null)
    {
        return $this->getElementsByFuzzyAttributeValue("class", $classname, $collection);
    }

    /**
     * Extract readable text from markup.
     *
     * The markup is split on newlines and <br> / </br> tags; every piece has
     * its tags stripped and its entities decoded, and the pieces are joined
     * with a separator appended after each one.
     *
     * @param string|array $html       Markup, or an element array whose "innerHTML" is used.
     * @param bool         $whitespace True joins pieces with "\n", otherwise with a space.
     * @param bool         $trim       Trim every piece and the final result.
     *
     * @return string
     *
     * @throws Exception When an array without "innerHTML" is supplied.
     */
    public function getTextContent($html, $whitespace = false, $trim = true)
    {
        if (is_array($html)) {
            if (!isset($html["innerHTML"])) {
                throw new Exception("(getTextContent) Supplied array doesn't contain a innerHTML index");
            }

            $html = $html["innerHTML"];
        }

        $separator = $whitespace === true ? "\n" : " ";
        $out = "";

        foreach (preg_split('/\n|<\/?br>/i', $html) as $piece) {
            $text = html_entity_decode(
                strip_tags($piece),
                ENT_QUOTES | ENT_XML1,
                "UTF-8"
            );

            if ($trim) {
                $text = trim($text);
            }

            $out .= $text . $separator;
        }

        if ($trim) {
            return trim($out);
        }

        return $out;
    }

    /**
     * Decode a compact JavaScript object literal by quoting its bare tokens
     * and handing the result to json_decode().
     *
     * Every unquoted key or value is wrapped in double quotes, so bare numbers
     * and keywords come back as strings. Whitespace outside of strings is not
     * skipped and quoted strings are copied through unchanged, so the input
     * has to be compact and use JSON-compatible string quoting to decode.
     *
     * @param string $json JavaScript object/array literal.
     *
     * @return mixed Decoded value, or null when the rewritten text is not valid JSON.
     */
    public function parseJsObject(string $json)
    {
        $bracket = false;          // quote character of the string being read, false outside strings
        $is_close_bracket = false; // true while standing on the quote that just closed a string
        $escape = false;           // true when the previous character was an unescaped backslash
        $json_out = "";
        $last_char = null;
        $length = strlen($json);

        for ($i = 0; $i < $length; $i++) {
            $char = $json[$i];

            if ($char === "\"" || $char === "'") {
                if ($escape === false) {
                    if ($char === $bracket) {
                        $bracket = false;
                        $is_close_bracket = true;
                    } elseif ($bracket === false) {
                        $bracket = $char;
                    }
                }
            } else {
                $is_close_bracket = false;
            }

            // A backslash escapes the next character only when it is not itself
            // escaped; otherwise a string ending in "\\" would never be closed.
            $escape = ($escape === false && $char === "\\");

            if ($bracket === false && $is_close_bracket === false) {
                // Outside of quoted strings: add the quotes JSON requires.
                if ($char === "[" || $char === "{") {
                    $json_out .= $char;
                } elseif (in_array($char, ["]", "}", ",", ":"], true)) {
                    // Close the bare token that ends here, if there was one.
                    if (!in_array($last_char, ["[", "{", "}", "]", "\""], true)) {
                        $json_out .= "\"";
                    }

                    $json_out .= $char;
                } else {
                    // First character of a bare token: open a quote.
                    if (in_array($last_char, ["{", "[", ",", ":"], true)) {
                        $json_out .= "\"";
                    }

                    $json_out .= $char;
                }
            } else {
                $json_out .= $char;
            }

            $last_char = $char;
        }

        return json_decode($json_out, true);
    }
}