<?php // todo: // aliexpress tracking links // enhanced msx notice class google{ private const is_class = "."; private const is_id = "#"; public function __construct(){ include "lib/fuckhtml.php"; $this->fuckhtml = new fuckhtml(); include "lib/backend.php"; $this->backend = new backend("google"); } public function getfilters($page){ $base = [ "country" => [ // gl=<country> "display" => "Country", "option" => [ "any" => "Instance's country", "af" => "Afghanistan", "al" => "Albania", "dz" => "Algeria", "as" => "American Samoa", "ad" => "Andorra", "ao" => "Angola", "ai" => "Anguilla", "aq" => "Antarctica", "ag" => "Antigua and Barbuda", "ar" => "Argentina", "am" => "Armenia", "aw" => "Aruba", "au" => "Australia", "at" => "Austria", "az" => "Azerbaijan", "bs" => "Bahamas", "bh" => "Bahrain", "bd" => "Bangladesh", "bb" => "Barbados", "by" => "Belarus", "be" => "Belgium", "bz" => "Belize", "bj" => "Benin", "bm" => "Bermuda", "bt" => "Bhutan", "bo" => "Bolivia", "ba" => "Bosnia and Herzegovina", "bw" => "Botswana", "bv" => "Bouvet Island", "br" => "Brazil", "io" => "British Indian Ocean Territory", "bn" => "Brunei Darussalam", "bg" => "Bulgaria", "bf" => "Burkina Faso", "bi" => "Burundi", "kh" => "Cambodia", "cm" => "Cameroon", "ca" => "Canada", "cv" => "Cape Verde", "ky" => "Cayman Islands", "cf" => "Central African Republic", "td" => "Chad", "cl" => "Chile", "cn" => "China", "cx" => "Christmas Island", "cc" => "Cocos (Keeling) Islands", "co" => "Colombia", "km" => "Comoros", "cg" => "Congo", "cd" => "Congo, the Democratic Republic", "ck" => "Cook Islands", "cr" => "Costa Rica", "ci" => "Cote D'ivoire", "hr" => "Croatia", "cu" => "Cuba", "cy" => "Cyprus", "cz" => "Czech Republic", "dk" => "Denmark", "dj" => "Djibouti", "dm" => "Dominica", "do" => "Dominican Republic", "ec" => "Ecuador", "eg" => "Egypt", "sv" => "El Salvador", "gq" => "Equatorial Guinea", "er" => "Eritrea", "ee" => "Estonia", "et" => "Ethiopia", "fk" => "Falkland Islands (Malvinas)", "fo" => "Faroe Islands", "fj" => "Fiji", "fi" => "Finland", "fr" => "France", "gf" => "French Guiana", "pf" => "French Polynesia", "tf" => "French Southern Territories", "ga" => "Gabon", "gm" => "Gambia", "ge" => "Georgia", "de" => "Germany", "gh" => "Ghana", "gi" => "Gibraltar", "gr" => "Greece", "gl" => "Greenland", "gd" => "Grenada", "gp" => "Guadeloupe", "gu" => "Guam", "gt" => "Guatemala", "gn" => "Guinea", "gw" => "Guinea-Bissau", "gy" => "Guyana", "ht" => "Haiti", "hm" => "Heard Island and Mcdonald Islands", "va" => "Holy See (Vatican City State)", "hn" => "Honduras", "hk" => "Hong Kong", "hu" => "Hungary", "is" => "Iceland", "in" => "India", "id" => "Indonesia", "ir" => "Iran, Islamic Republic", "iq" => "Iraq", "ie" => "Ireland", "il" => "Israel", "it" => "Italy", "jm" => "Jamaica", "jp" => "Japan", "jo" => "Jordan", "kz" => "Kazakhstan", "ke" => "Kenya", "ki" => "Kiribati", "kp" => "Korea, Democratic People's Republic", "kr" => "Korea, Republic", "kw" => "Kuwait", "kg" => "Kyrgyzstan", "la" => "Lao People's Democratic Republic", "lv" => "Latvia", "lb" => "Lebanon", "ls" => "Lesotho", "lr" => "Liberia", "ly" => "Libyan Arab Jamahiriya", "li" => "Liechtenstein", "lt" => "Lithuania", "lu" => "Luxembourg", "mo" => "Macao", "mk" => "Macedonia, the Former Yugosalv Republic", "mg" => "Madagascar", "mw" => "Malawi", "my" => "Malaysia", "mv" => "Maldives", "ml" => "Mali", "mt" => "Malta", "mh" => "Marshall Islands", "mq" => "Martinique", "mr" => "Mauritania", "mu" => "Mauritius", "yt" => "Mayotte", "mx" => "Mexico", "fm" => "Micronesia, Federated States", "md" => "Moldova, Republic", "mc" => "Monaco", "mn" => "Mongolia", "ms" => "Montserrat", "ma" => "Morocco", "mz" => "Mozambique", "mm" => "Myanmar", "na" => "Namibia", "nr" => "Nauru", "np" => "Nepal", "nl" => "Netherlands", "an" => "Netherlands Antilles", "nc" => "New Caledonia", "nz" => "New Zealand", "ni" => "Nicaragua", "ne" => "Niger", "ng" => "Nigeria", "nu" => "Niue", "nf" => "Norfolk Island", "mp" => "Northern Mariana Islands", "no" => "Norway", "om" => "Oman", "pk" => "Pakistan", "pw" => "Palau", "ps" => "Palestinian Territory, Occupied", "pa" => "Panama", "pg" => "Papua New Guinea", "py" => "Paraguay", "pe" => "Peru", "ph" => "Philippines", "pn" => "Pitcairn", "pl" => "Poland", "pt" => "Portugal", "pr" => "Puerto Rico", "qa" => "Qatar", "re" => "Reunion", "ro" => "Romania", "ru" => "Russian Federation", "rw" => "Rwanda", "sh" => "Saint Helena", "kn" => "Saint Kitts and Nevis", "lc" => "Saint Lucia", "pm" => "Saint Pierre and Miquelon", "vc" => "Saint Vincent and the Grenadines", "ws" => "Samoa", "sm" => "San Marino", "st" => "Sao Tome and Principe", "sa" => "Saudi Arabia", "sn" => "Senegal", "cs" => "Serbia and Montenegro", "sc" => "Seychelles", "sl" => "Sierra Leone", "sg" => "Singapore", "sk" => "Slovakia", "si" => "Slovenia", "sb" => "Solomon Islands", "so" => "Somalia", "za" => "South Africa", "gs" => "South Georgia and the South Sandwich Islands", "es" => "Spain", "lk" => "Sri Lanka", "sd" => "Sudan", "sr" => "Suriname", "sj" => "Svalbard and Jan Mayen", "sz" => "Swaziland", "se" => "Sweden", "ch" => "Switzerland", "sy" => "Syrian Arab Republic", "tw" => "Taiwan, Province of China", "tj" => "Tajikistan", "tz" => "Tanzania, United Republic", "th" => "Thailand", "tl" => "Timor-Leste", "tg" => "Togo", "tk" => "Tokelau", "to" => "Tonga", "tt" => "Trinidad and Tobago", "tn" => "Tunisia", "tr" => "Turkey", "tm" => "Turkmenistan", "tc" => "Turks and Caicos Islands", "tv" => "Tuvalu", "ug" => "Uganda", "ua" => "Ukraine", "ae" => "United Arab Emirates", "uk" => "United Kingdom", "us" => "United States", "um" => "United States Minor Outlying Islands", "uy" => "Uruguay", "uz" => "Uzbekistan", "vu" => "Vanuatu", "ve" => "Venezuela", "vn" => "Viet Nam", "vg" => "Virgin Islands, British", "vi" => "Virgin Islands, U.S.", "wf" => "Wallis and Futuna", "eh" => "Western Sahara", "ye" => "Yemen", "zm" => "Zambia", "zw" => "Zimbabwe" ] ], "nsfw" => [ "display" => "NSFW", "option" => [ "yes" => "Yes", // safe=active "no" => "No" // safe=off ] ], "lang" => [ // lr=<lang> (prefix lang with "lang_") "display" => "Language", "option" => [ "any" => "Any language", "ar" => "Arabic", "bg" => "Bulgarian", "ca" => "Catalan", "cs" => "Czech", "da" => "Danish", "de" => "German", "el" => "Greek", "en" => "English", "es" => "Spanish", "et" => "Estonian", "fi" => "Finnish", "fr" => "French", "hr" => "Croatian", "hu" => "Hungarian", "id" => "Indonesian", "is" => "Icelandic", "it" => "Italian", "iw" => "Hebrew", "ja" => "Japanese", "ko" => "Korean", "lt" => "Lithuanian", "lv" => "Latvian", "nl" => "Dutch", "no" => "Norwegian", "pl" => "Polish", "pt" => "Portuguese", "ro" => "Romanian", "ru" => "Russian", "sk" => "Slovak", "sl" => "Slovenian", "sr" => "Serbian", "sv" => "Swedish", "tr" => "Turkish", "zh-CN" => "Chinese (Simplified)", "zh-TW" => "Chinese (Traditional)" ] ] ]; switch($page){ case "web": return array_merge( $base, [ "newer" => [ // &sort=review-date:r:20090301:20090430 "display" => "Newer than", "option" => "_DATE" ], "older" => [ "display" => "Older than", "option" => "_DATE" ] ] ); break; case "images": return array_merge( $base, [ "time" => [ // tbs=qrd:<size> "display" => "Time posted", "option" => [ "any" => "Any time", "d" => "Past 24 hours", "w" => "Past week", "m" => "Past month", "y" => "Past year" ] ], "size" => [ "display" => "Size", "option" => [ // tbs=isz:<size> "any" => "Any size", "l" => "Large", "m" => "Medium", "i" => "Icon", // from here // tbz:lt,islt:<size> "qsvga" => "Larger than 400x300", "vga" => "Larger than 640x480", "qsvga" => "Larger than 800x600", "xga" => "Larger than 1024x768", "2mp" => "Larger than 2MP", "4mp" => "Larger than 4MP", "6mp" => "Larger than 6MP", "8mp" => "Larger than 8MP", "10mp" => "Larger than 10MP", "12mp" => "Larger than 12MP", "15mp" => "Larger than 15MP", "20mp" => "Larger than 20MP", "40mp" => "Larger than 40MP", "70mp" => "Larger than 70MP" ] ], "ratio" => [ // tbs=iar:<size> "display" => "Aspect ratio", "option" => [ "any" => "Any ratio", "t" => "Tall", "s" => "Square", "w" => "Wide", "xw" => "Panoramic" ] ], "color" => [ // tbs=ic:<color> "display" => "Color", "option" => [ "any" => "Any color", "color" => "Full color", "gray" => "Black & white", "trans" => "Transparent", // from there, its ic:specific,isc:<color> "red" => "Red", "orange" => "Orange", "yellow" => "Yellow", "green" => "Green", "teal" => "Teal", "blue" => "Blue", "purple" => "Purple", "pink" => "Pink", "white" => "White", "gray" => "Gray", "black" => "Black", "brown" => "Brown" ] ], "type" => [ // tbs=itp:<type> "display" => "Type", "option" => [ "any" => "Any type", "face" => "Faces", "clipart" => "Clip Art", "lineart" => "Line Drawing", "stock" => "Stock", "animated" => "Animated" ] ], "format" => [ // tbs=ift:<format> "display" => "Format", "option" => [ "any" => "Any format", "jpg" => "JPG", "gif" => "GIF", "png" => "PNG", "bmp" => "BMP", "svg" => "SVG", "webp" => "WEBP", "ico" => "ICO", "craw" => "RAW" ] ], "rights" => [ // tbs=il:<rights> "display" => "Usage rights", "option" => [ "any" => "Any license", "cl" => "Creative Commons licenses", "ol" => "Commercial & other licenses" ] ] ] ); break; case "videos": return array_merge( $base, [ "time" => [ "display" => "Time posted", "option" => [ // tbs=qdr "any" => "Any time", "h" => "Past hour", "d" => "Past 24 hours", "w" => "Past week", "m" => "Past month", "y" => "Past year" ] ], "duration" => [ "display" => "Duration", "option" => [ "any" => "Any duration", "s" => "Short (0-4min)", // tbs=dur:s "m" => "Medium (4-20min)", // tbs=dur:m "l" => "Long (20+ min)" // tbs=dur:l ] ], "quality" => [ "display" => "Quality", "option" => [ "any" => "Any quality", "h" => "High quality" // tbs=hq:h ] ], "captions" => [ "display" => "Captions", "option" => [ "any" => "No preference", "yes" => "Closed captioned" // tbs=cc:1 ] ] ] ); break; case "news": return array_merge( $base, [ "time" => [ "display" => "Time posted", "option" => [ // tbs=qdr "any" => "Any time", "h" => "Past hour", "d" => "Past 24 hours", "w" => "Past week", "m" => "Past month", "y" => "Past year", "a" => "Archives" // tbs=ar:1 ] ], "sort" => [ "display" => "Sort", "option" => [ "relevance" => "Relevance", "date" => "Date" // sbd:1 ] ] ] ); break; } } private function get($proxy, $url, $get = []){ $headers = [ "User-Agent: Mozilla/5.0 (Linux; U; Android 2.3.3; pt-pt; LG-P500h-parrot Build/GRI40) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1 MMS/LG-Android-MMS-V1.0/1.2", "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8", "Accept-Language: en-US,en;q=0.5", "Accept-Encoding: gzip", "DNT: 1", "Cookie: SOCS=CAESNQgCEitib3FfaWRlbnRpdHlmcm9udGVuZHVpc2VydmVyXzIwMjQwMzE3LjA4X3AwGgJlbiAEGgYIgM7orwY", "Connection: keep-alive", "Upgrade-Insecure-Requests: 1", "Sec-Fetch-Dest: document", "Sec-Fetch-Mode: navigate", "Sec-Fetch-Site: none", "Sec-Fetch-User: ?1" ]; $curlproc = curl_init(); if($get !== []){ $get = http_build_query($get); $url .= "?" . $get; } curl_setopt($curlproc, CURLOPT_URL, $url); curl_setopt($curlproc, CURLOPT_ENCODING, ""); // default encoding curl_setopt($curlproc, CURLOPT_HTTPHEADER, $headers); curl_setopt($curlproc, CURLOPT_RETURNTRANSFER, true); curl_setopt($curlproc, CURLOPT_SSL_VERIFYHOST, 2); curl_setopt($curlproc, CURLOPT_SSL_VERIFYPEER, true); curl_setopt($curlproc, CURLOPT_CONNECTTIMEOUT, 30); curl_setopt($curlproc, CURLOPT_TIMEOUT, 30); $this->backend->assign_proxy($curlproc, $proxy); $data = curl_exec($curlproc); if(curl_errno($curlproc)){ throw new Exception(curl_error($curlproc)); } curl_close($curlproc); return $data; } public function web($get){ if($get["npt"]){ [$req, $ip] = $this->backend->get($get["npt"], "web"); parse_str( parse_url($req, PHP_URL_QUERY), $search ); if(isset($search["q"])){ $search = $search["q"]; }else{ $search = "a"; // lol } try{ $html = $this->get( $ip, "https://www.google.com" . $req, [] ); }catch(Exception $error){ throw new Exception("Failed to get HTML"); } }else{ $search = $get["s"]; $country = $get["country"]; $nsfw = $get["nsfw"]; $lang = $get["lang"]; $older = $get["older"]; $newer = $get["newer"]; $ip = $this->backend->get_ip(); $params = [ "q" => $search, "hl" => "en", "num" => 20 // get 20 results ]; // country if($country != "any"){ $params["gl"] = $country; } // nsfw $params["safe"] = $nsfw == "yes" ? "off" : "active"; // language if($lang != "any"){ $params["lr"] = "lang_" . $lang; } // &sort=review-date:r:20090301:20090430 $older = $older === false ? false : date("Ymd", $older); $newer = $newer === false ? false : date("Ymd", $newer); if( $older !== false && $newer === false ){ $newer = date("Ymd", time()); } if( $older !== false || $newer !== false ){ $params["sort"] = "review-date:r:" . $older . ":" . $newer; } try{ $html = $this->get( $ip, "https://www.google.com/search", $params ); }catch(Exception $error){ throw new Exception("Failed to get HTML"); } //$html = file_get_contents("scraper/google.html"); } return $this->parsepage($html, "web", $search, $ip); } public function video($get){ if($get["npt"]){ [$req, $ip] = $this->backend->get($get["npt"], "videos"); parse_str( parse_url($req, PHP_URL_QUERY), $search ); if(isset($search["q"])){ $search = $search["q"]; }else{ $search = "a"; // lol } try{ $html = $this->get( $ip, "https://www.google.com" . $req, [] ); }catch(Exception $error){ throw new Exception("Failed to get HTML"); } }else{ $search = $get["s"]; $country = $get["country"]; $nsfw = $get["nsfw"]; $lang = $get["lang"]; $time = $get["time"]; $duration = $get["duration"]; $quality = $get["quality"]; $captions = $get["captions"]; $ip = $this->backend->get_ip(); $params = [ "q" => $search, "tbm" => "vid", "hl" => "en", "num" => "20" ]; // country if($country != "any"){ $params["gl"] = $country; } // nsfw $params["safe"] = $nsfw == "yes" ? "off" : "active"; // language if($lang != "any"){ $params["lr"] = "lang_" . $lang; } $tbs = []; // time if($time != "any"){ $tbs[] = "qdr:" . $time; } // duration if($duration != "any"){ $tbs[] = "dur:" . $duration; } // quality if($quality != "any"){ $tbs[] = "hq:" . $quality; } // captions if($captions != "any"){ $tbs[] = "cc:" . $captions; } // append tbs if(count($tbs) !== 0){ $params["tbs"] = implode(",", $tbs); } try{ $html = $this->get( $ip, "https://www.google.com/search", $params ); }catch(Exception $error){ throw new Exception("Failed to get HTML"); } } $json = $this->parsepage($html, "videos", $search, $ip); $out = [ "status" => "ok", "npt" => $json["npt"], "video" => [], "author" => [], "livestream" => [], "playlist" => [], "reel" => [] ]; foreach($json["web"] as $item){ $out["video"][] = [ "title" => $item["title"], "description" => $item["description"], "author" => [ "name" => null, "url" => null, "avatar" => null ], "date" => isset($item["table"]["Posted"]) ? strtotime($item["table"]["Posted"]) : null, "duration" => isset($item["table"]["Duration"]) ? $this->hms2int($item["table"]["Duration"]) : null, "views" => null, "thumb" => $item["thumb"]["url"] === null ? [ "url" => null, "ratio" => null ] : [ "url" => $item["thumb"]["url"], "ratio" => "16:9" ], "url" => $item["url"] ]; } return $out; } public function news($get){ if($get["npt"]){ [$req, $ip] = $this->backend->get($get["npt"], "news"); parse_str( parse_url($req, PHP_URL_QUERY), $search ); if(isset($search["q"])){ $search = $search["q"]; }else{ $search = "a"; // lol } try{ $html = $this->get( $ip, "https://www.google.com" . $req, [] ); }catch(Exception $error){ throw new Exception("Failed to get HTML"); } }else{ $search = $get["s"]; $country = $get["country"]; $nsfw = $get["nsfw"]; $lang = $get["lang"]; $time = $get["time"]; $sort = $get["sort"]; $ip = $this->backend->get_ip(); $params = [ "q" => $search, "tbm" => "nws", "hl" => "en", "num" => "20" ]; // country if($country != "any"){ $params["gl"] = $country; } // nsfw $params["safe"] = $nsfw == "yes" ? "off" : "active"; // language if($lang != "any"){ $params["lr"] = "lang_" . $lang; } $tbs = []; // time if($time != "any"){ if($time == "a"){ $tbs[] = "ar:1"; }else{ $tbs[] = "qdr:" . $time; } } // relevance if($sort == "date"){ $tbs[] = "sbd:1"; } // append tbs if(count($tbs) !== 0){ $params["tbs"] = implode(",", $tbs); } $html = $this->get( $ip, "https://www.google.com/search", $params ); } $json = $this->parsepage($html, "news", $search, $ip); $out = [ "status" => "ok", "npt" => $json["npt"], "news" => [] ]; foreach($json["web"] as $item){ $description = array_key_first($item["table"]); if($description !== null){ $date = $item["table"][$description]; }else{ $date = null; } $out["news"][] = [ "title" => $item["title"], "author" => $item["author"], "description" => $description, "date" => strtotime($date), "thumb" => $item["thumb"]["url"] === null ? [ "url" => null, "ratio" => null ] : [ "url" => $item["thumb"]["url"], "ratio" => "16:9" ], "url" => $item["url"] ]; } return $out; } private function parsepage($html, $pagetype, $search, $ip){ /* $handle = fopen("scraper/google.html", "r"); $html = fread($handle, filesize("scraper/google.html")); fclose($handle); */ $out = [ "status" => "ok", "spelling" => [ "type" => "no_correction", "using" => null, "correction" => null ], "npt" => null, "answer" => [], "web" => [], "image" => [], "video" => [], "news" => [], "related" => [] ]; if($error = $this->detect_sorry($html)){ throw new Exception($error); } $this->parsejavascript($html); // // parse accdef's // $has_appended_accdef = false; preg_match_all( '/window\.jsl\.dh\(\'(accdef_[0-9]+)\',\'(.*)\'\);/', $html, $accdefs_regex ); $accdefs = []; for($i=0; $i<count($accdefs_regex[0]); $i++){ // decode UTF-16 string $answer = $this->fuckhtml ->parseJsString( $accdefs_regex[2][$i] ); $this->fuckhtml->load($answer); // get description $description = $this->fuckhtml ->getElementsByClassName( $this->findstyles( [ "padding" => "12px 16px 12px", ], self::is_class ), "div" )[1]; // get date (rare) $date = $this->fuckhtml ->getElementsByTagName("sub"); if(count($date) !== 0){ $description = str_replace( $date[0]["outerHTML"], "", $description["innerHTML"] ); $date = strtotime( $this->fuckhtml ->getTextContent( $date[0] ) ); }else{ $date = null; } // get information table $table = []; $tbody = $this->fuckhtml ->getElementsByTagName("tbody"); if(count($tbody) !== 0){ $this->fuckhtml->load($tbody[0]); $trs = $this->fuckhtml ->getElementsByTagName("tr"); foreach($trs as $tr){ $this->fuckhtml->load($tr); $tds = $this->fuckhtml ->getElementsByTagName("td"); if(count($tds) === 2){ $table[ $this->fuckhtml ->getTextContent( $tds[0] ) ] = $this->fuckhtml ->getTextContent( $tds[1] ); } } // load back what we had $this->fuckhtml->load($answer); } // get title & link $a = $this->fuckhtml ->getElementsByTagName("a")[0]; $this->fuckhtml->load($a); $title = $this->fuckhtml ->getElementsByTagName("span"); if(count($title) === 0){ continue; } $accdefs[] = [ "title" => $this->titledots( $this->fuckhtml ->getTextContent( $title[0] ) ), "description" => $this->fuckhtml ->getTextContent( $description ), "url" => $this->unshiturl( $a["attributes"]["href"] ), "date" => $date, "type" => "web", "thumb" => [ "url" => null, "ratio" => null ], "sublink" => [], "table" => $table ]; } $this->fuckhtml->load($html); $containers = $this->fuckhtml ->getElementsByClassName( $this->findstyles( [ "background-color" => "#fff", "margin-bottom" => "10px", "-webkit-box-shadow" => "0 1px 6px rgba(32,33,36,0.28)", "border-radius" => "8px" ], self::is_class ), "div" ); foreach($containers as $container){ $this->fuckhtml->load($container); // detect spelling $spelling = $this->fuckhtml ->getElementsByClassName( $this->findstyles( [ "font-size" => "20px", "line-height" => "26px", "padding-top" => "2px", "margin-bottom" => "1px" ], self::is_class ), "div" ); if(count($spelling) !== 0){ $a = $this->fuckhtml ->getElementsByTagName("a"); if(count($a) !== 0){ $scripts = $this->fuckhtml ->getElementsByTagName("script"); foreach($scripts as $script){ $container["innerHTML"] = str_replace( $script["outerHTML"], "", $container["innerHTML"] ); } $container["innerHTML"] = $this->fuckhtml ->getTextContent( str_replace( $a[0]["outerHTML"], "", $container["innerHTML"] ) ); if( preg_match( '/^did you mean/i', $container["innerHTML"] ) ){ $out["spelling"] = [ "type" => "not_many", "using" => $search, "correction" => $this->fuckhtml ->getTextContent( $a[0] ) ]; } elseif( preg_match( '/^showing results for/i', $container["innerHTML"] ) ){ $out["spelling"] = [ "type" => "including", "using" => $this->fuckhtml ->getTextContent( $a[0] ), "correction" => $search ]; } } continue; } $title = $this->fuckhtml ->getElementsByClassName( $this->findstyles( [ "color" => "#1967d2", "font-size" => "20px", "line-height" => "26px" ], self::is_class ), "div" ); if(count($title) !== 0){ // // Container is a web link // $web = [ "title" => $this->titledots( $this->fuckhtml ->getTextContent( $title[0] ) ), "description" => null, "url" => null, "date" => null, "type" => "web", "thumb" => [ "url" => null, "ratio" => null ], "sublink" => [], "table" => [] ]; // get link $web["url"] = $this->unshiturl( $this->fuckhtml ->getElementsByTagName("a") [0] ["attributes"] ["href"] ); // // check if link contains a carousel // $carousels = $this->parsecarousels(); if(count($carousels) !== 0){ $first = true; foreach($carousels as $carousel_cat){ foreach($carousel_cat as $carousel){ if($first === true){ $first = false; }elseif($carousel["image"] !== null){ $out["image"][] = [ "title" => $carousel["title"], "source" => [ [ "url" => $carousel["image"], "width" => null, "height" => null ] ], "url" => $carousel["url"] ]; } $web["sublink"][] = [ "title" => $carousel["title"], "date" => $carousel["date"], "description" => $carousel["description"], "url" => $carousel["url"] ]; } } if($carousels[0][0]["image"] !== null){ $web["thumb"] = [ "url" => $carousels[0][0]["image"], "ratio" => "16:9" ]; } $out["web"][] = $web; continue; } // // no carousel entries, parse as normal link // $this->fuckhtml->load($container); // parse URL $web["url"] = $this->unshiturl( $this->fuckhtml ->getElementsByTagName("a") [0] ["attributes"] ["href"] ); $container = $container["innerHTML"]; $line_detect = $this->fuckhtml ->getElementsByClassName( $this->findstyles( [ "height" => "1px", "background-color" => "#dadce0", "margin" => "0 16px" ], self::is_class ), "div" ); if(count($line_detect) !== 0){ // we found a line, this means we're dealing with a // "featured snippet" $featured = true; $description_container = $this->fuckhtml ->getElementsByClassName( $this->findstyles( [ "white-space" => "pre-line", "word-wrap" => "break-word" ], self::is_class ), "div" )[1]; // get date node for it $date = $this->fuckhtml ->getElementsByTagName("sub"); if(count($date) !== 0){ $web["date"] = strtotime( $this->fuckhtml ->getTextContent( $date[0] ) ); } }else{ // we're dealing with a normal link $featured = false; $description_container = $this->fuckhtml ->getElementsByClassName( $this->findstyles( [ "padding" => "12px 16px 12px" ], self::is_class ), "div" )[1]; } // // Get author if we're parsing news // if($pagetype == "news"){ $author = $this->fuckhtml ->getElementsByClassName( $this->findstyles( [ "position" => "absolute", "width" => "100%", "top" => "0", "left" => "0", "padding-top" => "1px", "margin-bottom" => "-1px" ], self::is_class ), "div" ); if(count($author) !== 0){ $web["author"] = $this->fuckhtml ->getTextContent( $author[0] ); }else{ $web["author"] = null; } } $description = $description_container["innerHTML"]; $this->fuckhtml->load($description); // // get thumbnail before we call loadhtml again // $img = $this->fuckhtml ->getElementsByTagName("img"); if(count($img) !== 0){ $skip = true; if( isset($img[0]["attributes"]["alt"]) && stripos($img[0]["attributes"]["alt"], "Video for") !== false ){ // is a video thumbnail $web["thumb"]["ratio"] = "16:9"; }else{ // is a google thumbnail $web["thumb"]["ratio"] = "1:1"; } $web["thumb"]["url"] = $this->getimage( $img[0]["attributes"]["id"] ); }else{ $skip = false; } // // get sublinks // $links = $this->fuckhtml ->getElementsByTagName("a"); foreach($links as $link){ if($skip === true){ $skip = false; continue; } $description = str_replace( $link["outerHTML"], "", $description ); $sublink = [ "title" => null, "description" => null, "url" => null, "date" => null ]; $sublink["title"] = $this->titledots( $this->fuckhtml ->getTextContent( $link ) ); $sublink["url"] = $this->unshiturl( $link ["attributes"] ["href"] ); if(parse_url($sublink["url"], PHP_URL_HOST) !== null){ $web["sublink"][] = $sublink; } } // // Parse spans in description // $this->fuckhtml->load($description); if($featured === false){ $levels = $this->fuckhtml ->getElementsByClassName( $this->findstyles( [ "padding-bottom" => "8px" ], self::is_class ), "div" ); // oh my god yes, fucking great, sometimes there are NO levels // hahahahahhahahahahahahahahahhahaa if(count($levels) === 0){ $levels = [$description]; } foreach($levels as $level){ $this->fuckhtml->load($level); $spans = $this->fuckhtml ->getElementsByTagName( "span" ); $is_rating = -1; foreach($spans as $span){ $innertext = trim( $this->fuckhtml ->getTextContent( $span ), " ·." ); if($innertext == ""){ continue; } if( strtolower($innertext) == "rating" ){ $is_rating = 0; // clean up before we go $description = str_replace( $span["outerHTML"], "", $description ); continue; } // // Parse rating object // if($is_rating >= 0){ // clean up description $description = str_replace( $span["outerHTML"], "", $description ); if($span["level"] !== 1){ continue; } $is_rating++; // 10/10 (123) if($is_rating === 1){ $innertext = explode(" ", $innertext, 2); $web["table"]["Rating"] = $innertext[0]; if(count($innertext) === 2){ $web["table"]["Hits"] = trim( str_replace( [ "(", ")" ], "", $innertext[1] ) ); if($web["table"]["Hits"] == ""){ unset($web["table"]["Hits"]); } } continue; } // US$4.99 // MYR 50.00 // $38.34 // JP¥6,480 // Reviewed by your mom if($is_rating === 2){ if( preg_match( '/^Review by (.+)/', $innertext, $match ) ){ $web["table"]["Author"] = $match[1]; continue; } $web["table"]["Price"] = $innertext; continue; } // Android / In stock if($is_rating === 3){ $web["table"]["Support"] = $innertext; continue; } // ignore the rest continue; } // // Parse standalone text // // If we reach this point: // 1. Ratings have been parsed // 2. We're parsing a WEB link, not some shitty piece of shit // check for date // if span has no text before it, assume it's a date $desc_split = explode( $span["outerHTML"], $description, 2 ); if( $this->fuckhtml ->getTextContent( $desc_split[0] ) == "" ){ // has no text before $date = strtotime($innertext); if($date){ $web["date"] = $date; } // cleanup $description = str_replace( $span["outerHTML"], "", $description ); continue; } // Ready to parse table if(count($desc_split) === 2){ $this->fuckhtml->load($desc_split[1]); $web["table"][ $this->fuckhtml ->getTextContent( trim($desc_split[0], ": ") ) ] = $innertext; // cleanup $description = str_replace( $desc_split[0] . $span["outerHTML"], "", $description ); } } } } $web["description"] = trim( $this->fuckhtml ->getTextContent( $description ), " ·." ); if($web["description"] == ""){ $web["description"] = null; } $out["web"][] = $web; continue; } // // Detect wikipedia shit // $wiki_title = $this->fuckhtml ->getElementsByTagName("h3"); if(count($wiki_title) !== 0){ $description_after = []; $description = []; $table = []; $sublink = []; $as = $this->fuckhtml ->getElementsByTagName("a"); foreach($as as $a){ if( isset($a["attributes"]["href"]) && parse_url($a["attributes"]["href"], PHP_URL_HOST) == "maps.google.com" ){ // detected maps embed, ignore continue 2; } } // get carousels and remove them from container for image grepper $carousels = $this->parsecarousels($container["innerHTML"]); $this->fuckhtml->load($container); // add images to image tab, if applicable for($i=0; $i<count($carousels); $i++){ foreach($carousels[$i] as $item){ if( $item["url"] !== null && $item["ref"] !== null && $item["image"] !== null && $item["title"] !== null ){ $out["image"][] = [ "title" => $item["title"], "source" => [ [ "url" => $item["url"], "width" => $item["image_width"], "height" => $item["image_height"] ], [ "url" => $item["image"], "width" => $item["thumb_width"], "height" => $item["thumb_height"] ] ], "url" => $item["ref"] ]; unset($carousels[$i]); } } } $carousels = array_values($carousels); // interpret remaining carousels as title + carousel $titles = $this->fuckhtml ->getElementsByClassName( $this->findstyles( [ "font-weight" => "700", "letter-spacing" => "0.75px", "text-transform" => "uppercase" ], self::is_class ) ); for($i=0; $i<count($titles); $i++){ if(!isset($carousels[$i])){ break; } $description_after[] = [ "type" => "title", "value" => $this->fuckhtml ->getTextContent( $titles[$i] ) ]; foreach($carousels[$i] as $carousel){ $description_after[] = [ "type" => "link", "url" => "web?s=" . urlencode($carousel["description"]) . "&scraper=google", "value" => $carousel["description"] ]; if($carousel["subtext"] !== null){ $description_after[] = [ "type" => "quote", "value" => $carousel["subtext"] ]; } $description_after[] = [ "type" => "image", "url" => $carousel["image"] ]; } } $categories = $this->fuckhtml ->getElementsByClassName( $this->findstyles( [ "padding" => "12px 16px 12px" ], self::is_class ) ); $image = $this->fuckhtml ->getElementsByTagName("img"); if(count($image) !== 0){ $image = $this->getimage($image[0]["attributes"]["id"]); }else{ $image = null; } $url = null; for($i=0; $i<count($categories); $i++){ $this->fuckhtml->load($categories[$i]); if($i === 0){ // first node. this should be the header with the small // information snippet $url = $this->fuckhtml ->getElementsByTagName("a"); if(count($url) !== 0){ $url = $this->unshiturl( $url[0]["attributes"]["href"] ); if(parse_url($url, PHP_URL_HOST) == "encrypted-tbn0.gstatic.com"){ $image = $url; $url = null; } }else{ $url = null; } $categories[$i]["innerHTML"] = str_replace( $wiki_title[0]["outerHTML"], "", $categories[$i]["innerHTML"] ); $subtext = $this->fuckhtml ->getTextContent( $categories[$i]["innerHTML"] ); if(strlen($subtext) !== 0){ $description[] = [ "type" => "quote", "value" => $this->fuckhtml ->getTextContent( $categories[$i]["innerHTML"] ) ]; } // detect audio file $audio = $this->fuckhtml ->getElementsByTagName("audio"); if(count($audio) !== 0){ $description[] = [ "type" => "audio", "url" => $this->fuckhtml ->getTextContent( $audio[0]["attributes"]["src"] ) ]; } }else{ // check for separator elements IN THERE $separators = $this->fuckhtml ->getElementsByClassName( $this->findstyles( [ "white-space" => "pre-line", "word-wrap" => "break-word" ], self::is_class ), "div" ); // detect container type foreach($separators as $separator){ $this->fuckhtml->load($separator); // ignore wrong levels if($separator["level"] !== 2){ continue; } // // Detect word definition // $wordwraps = $this->fuckhtml ->getElementsByClassName( $this->findstyles( [ "padding-bottom" => "12px" ], self::is_class ), "div" ); if(count($wordwraps) !== 0){ foreach($wordwraps as $word){ $this->fuckhtml->load($word); // detect title $span = $this->fuckhtml ->getElementsByTagName( "span" ); if( count($span) === 1 && $this->fuckhtml ->getTextContent( str_replace( $span[0]["outerHTML"], "", $word["innerHTML"] ) ) == "" ){ $description[] = [ "type" => "title", "value" => $this->fuckhtml ->getTextContent( $span[0] ) ]; continue; } // detect list element $lists = $this->fuckhtml ->getElementsByTagName("ol"); if(count($lists) !== 0){ foreach($lists as $list){ $this->fuckhtml->load($list); $items = $this->fuckhtml ->getElementsByTagName("li"); $w = 0; foreach($items as $item){ $w++; $this->fuckhtml->load($item); // get subnodes $subnodes = $this->fuckhtml ->getElementsByClassName( $this->findstyles( [ "white-space" => "pre-line", "word-wrap" => "break-word" ], self::is_class ), "div" ); foreach($subnodes as $subnode){ $this->fuckhtml->load($subnode); $spans = $this->fuckhtml ->getElementsByTagName("span"); if(count($spans) !== 0){ // append quote $description[] = [ "type" => "quote", "value" => $this->fuckhtml ->getTextContent( $subnode ) ]; }else{ // append text $description[] = [ "type" => "text", "value" => $w . ". " . $this->fuckhtml ->getTextContent( $subnode ) ]; } } } } }else{ // parse without list // get subnodes $subnodes = $this->fuckhtml ->getElementsByClassName( $this->findstyles( [ "white-space" => "pre-line", "word-wrap" => "break-word" ], self::is_class ), "div" ); foreach($subnodes as $subnode){ $this->fuckhtml->load($subnode); $spans = $this->fuckhtml ->getElementsByTagName("span"); if(count($spans) !== 0){ // append quote $description[] = [ "type" => "quote", "value" => $this->fuckhtml ->getTextContent( $subnode ) ]; }else{ // append text $description[] = [ "type" => "text", "value" => $this->fuckhtml ->getTextContent( $subnode ) ]; } } } } }else{ // // Parse table // $spans = $this->fuckhtml ->getElementsByTagName("span"); foreach($spans as $span){ if(!isset($span["attributes"]["class"])){ // found table $row = explode( ":", $this->fuckhtml ->getTextContent( $separator ), 2 ); if(count($row) === 2){ $table[rtrim($row[0])] = ltrim($row[1]); } continue 2; } } // // Parse normal description // $links_rem = $this->fuckhtml ->getElementsByTagName("a"); foreach($links_rem as $rem){ $separator["innerHTML"] = str_replace( $rem["outerHTML"], "", $separator["innerHTML"] ); } $description[] = [ "type" => "text", "value" => rtrim( $this->fuckhtml ->getTextContent( $separator ), " .," ) ]; } } } // detect huge buttons $buttons = $this->fuckhtml ->getElementsByClassName( $this->findstyles( [ "display" => "table-cell", "vertical-align" => "middle", "height" => "52px", "text-align" => "center" ], self::is_class ), "a" ); if(count($buttons) !== 0){ foreach($buttons as $button){ if(isset($button["attributes"]["href"])){ $sublink[ $this->fuckhtml ->getTextContent( $button ) ] = $this->unshiturl( $button["attributes"]["href"] ); } } } } // append description_after (contains carousel info) $description = array_merge( $description, $description_after ); $out["answer"][] = [ "title" => $this->fuckhtml ->getTextContent( $wiki_title[0] ), "description" => $description, "url" => $url, "thumb" => $image, "table" => $table, "sublink" => $sublink ]; continue; } // // Detect related searches containers // $container_title = $this->fuckhtml ->getElementsByClassName( $this->findstyles( [ "color" => "#000", "font-size" => "16px", "font-weight" => "bold", "margin" => "0", "padding" => "12px 16px 0px 16px" ], self::is_class ), "div" ); if(count($container_title) !== 0){ // get carousel entries $carousels = $this->parsecarousels($container["innerHTML"]); $this->fuckhtml->load($container); foreach($carousels as $carousel){ foreach($carousel as $item){ if($item["url"] !== null){ $out["related"][] = $item["url"]; } } } $container_title = strtolower( $this->fuckhtml ->getTextContent( $container_title[0] ) ); switch($container_title){ case "related searches": case "people also search for": // // Parse related searches // $as = $this->fuckhtml ->getElementsByTagName("a"); foreach($as as $a){ $out["related"][] = $this->fuckhtml ->getTextContent($a); } break; case "people also ask": // get related queries $divs = $this->fuckhtml ->getElementsByTagName("div"); foreach($divs as $div){ // add accdef's here if($has_appended_accdef === false){ $out["web"] = array_merge($out["web"], $accdefs); $has_appended_accdef = true; } // add accdef's questions if(isset($div["attributes"]["role"])){ $out["related"][] = $this->fuckhtml ->getTextContent($div); continue; } } break; } continue; } // // Parse news // $title = $this->fuckhtml ->getElementsByClassName( $this->findstyles( [ "font-size" => "16px", "line-height" => "20px", "font-weight" => "400" ], self::is_class ), "div" ); if(count($title) !== 0){ $carousels = $this->parsecarousels(); $this->fuckhtml->load($container); if(count($carousels) === 0){ // no carousels found continue; } $title = strtolower( $this->fuckhtml ->getTextContent( $title[0] ) ); if( preg_match( '/^latest from|^top stories/', $title ) ){ // Found news article foreach($carousels[0] as $carousel){ if($carousel["image"] !== null){ $thumb = [ "url" => $carousel["image"], "ratio" => "16:9" ]; }else{ $thumb = [ "url" => null, "ratio" => null ]; } $out["news"][] = [ "title" => $carousel["title"], "description" => $carousel["description"], "date" => $carousel["date"], "thumb" => $thumb, "url" => $carousel["url"] ]; } } elseif( $title == "images" ){ foreach($carousels as $carousel){ foreach($carousel as $item){ $out["image"][] = [ "title" => $item["title"], "source" => [ [ "url" => $item["url"], "width" => $item["image_width"], "height" => $item["image_height"] ], [ "url" => $item["image"], "width" => $item["thumb_width"], "height" => $item["thumb_height"] ] ], "url" => $item["ref"] ]; } } } continue; } // // Detect nodes with only text + links // // ignore elements with <style> tags $style = $this->fuckhtml ->getElementsByTagName("style"); if(count($style) !== 0){ continue; } $as = $this->fuckhtml ->getElementsByTagName("a"); $description = []; $pcitems = $this->fuckhtml ->getElementsByClassName( "pcitem", "div" ); if(count($pcitems) !== 0){ // ignore elements with carousels in them continue; } foreach($as as $a){ // // Detect next page // if( isset($a["attributes"]["aria-label"]) && strtolower($a["attributes"]["aria-label"]) == "next page" ){ $out["npt"] = $this->backend->store( $this->fuckhtml ->getTextContent( $a["attributes"]["href"] ), $pagetype, $ip ); continue 2; } // // Parse as text node // $container["innerHTML"] = explode( $a["outerHTML"], $container["innerHTML"], 2 ); $before = $this->fuckhtml ->getTextContent( $container["innerHTML"][0], false, false ); // set after if(count($container["innerHTML"]) === 2){ $container["innerHTML"] = $container["innerHTML"][1]; }else{ $container["innerHTML"] = ""; } if($before != ""){ $description[] = [ "type" => "text", "value" => $before ]; } // add link $description[] = [ "type" => "link", "url" => $this->unshiturl( $a["attributes"] ["href"] ), "value" => $this->fuckhtml ->getTextContent( $a ) ]; } if($container["innerHTML"] != ""){ $description[] = [ "type" => "text", "value" => $this->fuckhtml ->getTextContent( $container["innerHTML"] ) ]; } $out["answer"][] = [ "title" => "Notice", "description" => $description, "url" => null, "thumb" => null, "table" => [], "sublink" => [] ]; } // // remove duplicate web links cause instant answers // sometimes contains duplicates // $c = count($out["web"]); $links = []; for($i=0; $i<$c; $i++){ foreach($links as $link){ if($out["web"][$i]["url"] == $link){ unset($out["web"][$i]); continue 2; } } $links[] = $out["web"][$i]["url"]; } $out["web"] = array_values($out["web"]); return $out; } public function image($get){ // generate parameters if($get["npt"]){ [$params, $proxy] = $this->backend->get( $get["npt"], "images" ); $params = json_decode($params, true); }else{ $search = $get["s"]; if(strlen($search) === 0){ throw new Exception("Search term is empty!"); } $proxy = $this->backend->get_ip(); $country = $get["country"]; $nsfw = $get["nsfw"]; $lang = $get["lang"]; $time = $get["time"]; $size = $get["size"]; $ratio = $get["ratio"]; $color = $get["color"]; $type = $get["type"]; $format = $get["format"]; $rights = $get["rights"]; $params = [ "q" => $search, "tbm" => "isch" ]; // country if($country != "any"){ $params["gl"] = $country; } // nsfw $params["safe"] = $nsfw == "yes" ? "off" : "active"; // language if($lang != "any"){ $params["lr"] = "lang_" . $lang; } $tbs = []; // time if($time != "any"){ $tbs[] = "qrd:" . $time; } // size if($size != "any"){ if( in_array( $size, ["l", "s", "i"] ) ){ $tbs[] = "isz:" . $size; }else{ $tbs[] = "tbz:lt"; $tbs[] = "islt:" . $size; } } // ratio if($ratio != "any"){ $tbs[] = "iar:" . $ratio; } // color if($color != "any"){ if( in_array( $color, ["color", "gray", "trans"] ) ){ $tbs[] = "ic:" . $color; }else{ $tbs[] = "ic:specific"; $tbs[] = "isc:" . $color; } } // type if($type != "any"){ $tbs[] = "itp:" . $type; } // format if($format != "any"){ $tbs[] = "ift:" . $format; } // rights if($rights != "any"){ $tbs[] = "il:" . $rights; } // append tbs if(count($tbs) !== 0){ $params["tbs"] = implode(",", $tbs); } } /* $handle = fopen("scraper/google-img.html", "r"); $html = fread($handle, filesize("scraper/google-img.html")); fclose($handle);*/ // scrape images try{ $html = $this->get( $proxy, "https://www.google.com/search", $params ); }catch(Exception $error){ throw new Exception("Failed to get search page"); } if($error = $this->detect_sorry($html)){ throw new Exception($error); } $out = [ "status" => "ok", "npt" => null, "image" => [] ]; $images = $this->fuckhtml ->getElementsByClassName( "islrtb isv-r", "div" ); foreach($images as $image){ $this->fuckhtml->load($image); $img = $this->fuckhtml ->getElementsByTagName("img")[0]; $og_width = (int)$image["attributes"]["data-ow"]; $og_height = (int)$image["attributes"]["data-oh"]; $thumb_width = (int)$image["attributes"]["data-tw"]; $ratio = $og_width / $og_height; if(isset($img["attributes"]["data-src"])){ $src = &$img["attributes"]["data-src"]; }else{ $src = &$img["attributes"]["src"]; } $thumb_height = floor($thumb_width / $ratio); $out["image"][] = [ "title" => $this->titledots( $this->fuckhtml ->getTextContent( $image["attributes"]["data-pt"] ) ), "source" => [ [ "url" => $this->fuckhtml ->getTextContent( $image["attributes"]["data-ou"] ), "width" => $og_width, "height" => $og_height ], [ "url" => $this->fuckhtml ->getTextContent( $src ), "width" => $thumb_width, "height" => $thumb_height ] ], "url" => $this->fuckhtml ->getTextContent( $image["attributes"]["data-ru"] ) ]; } // get next page // https://www.google.com/search // ?q=higurashi // &tbm=isch // &async=_id%3Aislrg_c%2C_fmt%3Ahtml // &asearch=ichunklite // &ved=0ahUKEwidjYXJqJSAAxWrElkFHZ07CDwQtDIIQygA if(count($out["image"]) !== 100){ // no more results return $out; } if($get["npt"]){ // update nextpage information $params["start"] = (int)$params["start"] + count($out["image"]); $params["ijn"] = (int)$params["ijn"] + 1; $out["npt"] = $this->backend->store( json_encode($params), "images", $proxy ); }else{ // scrape nextpage information $this->fuckhtml->load($html); $ved = $this->fuckhtml ->getElementById("islrg", "div"); if($ved){ $ved = $this->fuckhtml ->getTextContent( $ved["attributes"]["data-ved"] ); // &vet=1{$ved}..i (10ahUKEwidjYXJqJSAAxWrElkFHZ07CDwQtDIIQygA..i) /* These 2 are handled by us start = start + number of results ijn = current page number */ // &start=100 // &ijn=1 // &imgvl=CAEY7gQgBSj3Aji8VTjXVUC4AUC3AUgAYNdV preg_match( '/var e=\'([A-z0-9]+)\';/', $html, $imgvl ); if(isset($imgvl[1])){ $imgvl = $imgvl[1]; $params["async"] = "_id:islrg_c,_fmt:html"; $params["asearch"] = "ichunklite"; $params["ved"] = $ved; $params["vet"] = "1" . $ved . "..i"; $params["start"] = 100; $params["ijn"] = 1; $params["imgvl"] = $imgvl; $out["npt"] = $this->backend->store( json_encode($params), "images", $proxy ); } } } return $out; } private function hms2int($time){ $parts = explode(":", $time, 3); $time = 0; if(count($parts) === 3){ // hours $time = $time + ((int)$parts[0] * 3600); array_shift($parts); } if(count($parts) === 2){ // minutes $time = $time + ((int)$parts[0] * 60); array_shift($parts); } // seconds $time = $time + (int)$parts[0]; return $time; } private function parsejavascript($html){ $this->fuckhtml->load($html); $styles = $this->fuckhtml ->getElementsByTagName("style"); $this->computedstyle = []; $this->ask = []; foreach($styles as $style){ $this->computedstyle = array_merge( $this->computedstyle, $this->parsestyles($style["innerHTML"]) ); } // get images in javascript var preg_match( '/google\.ldi=({[^}]+})/', $html, $this->js_image ); if(count($this->js_image) !== 0){ $this->js_image = json_decode($this->js_image[1], true); }else{ $this->js_image = []; } // additional js_images present in <script> tags // ugh i fucking hate you $scripts = $this->fuckhtml ->getElementsByTagName("script"); foreach($scripts as $script){ if(!isset($script["innerHTML"])){ continue; } preg_match_all( '/var s=\'(data:image[^\']+)\';var i=\[(\'[^\;]*\')];/', $script["innerHTML"], $image_grep ); if(count($image_grep[0]) !== 0){ $items = explode(",", $image_grep[2][0]); $value = $this->fuckhtml ->getTextContent( $image_grep[1][0] ); foreach($items as $item){ $this->js_image[trim($item, "' ")] = $value; } } // even more javascript crap // "People also ask" node is loaded trough javascript preg_match_all( '/window\.jsl\.dh\(\'([^\']+)\',\'(.+)\'\);/', $script["innerHTML"], $ask_grep ); for($i=0; $i<count($ask_grep[0]); $i++){ $this->ask[trim($ask_grep[1][$i])] = $this->fuckhtml->parseJsString( $ask_grep[2][$i] ); } } } private function findstyles($rules, $is){ $c = count($rules); foreach($this->computedstyle as $classname => $styles){ if($classname[0] != $is){ // not a class, skip continue; } $i = 0; foreach($styles as $stylename => $stylevalue){ if( isset($rules[$stylename]) && $rules[$stylename] == $stylevalue ){ $i++; }else{ continue 2; } } if($c === $i){ return ltrim($classname, $is); } } // fail, did not find classname. return false; } private function parsestyles($style){ // get style tags preg_match_all( '/([^{]+){([^}]*)}/', $style, $tags_regex ); $tags = []; for($i=0; $i<count($tags_regex[0]); $i++){ $tagnames = explode(",", trim($tags_regex[1][$i])); foreach($tagnames as $tagname){ $tagname = trim($tagname); if(!isset($tags[$tagname])){ $tags[$tagname] = []; } $values = explode(";", $tags_regex[2][$i]); foreach($values as $value){ $value = explode(":", $value, 2); if(count($value) !== 2){ continue; } $tags[$tagname][trim($value[0])] = trim($value[1]); } } } return $tags; } private function getimage($id){ if(isset($this->js_image[$id])){ $return = $this->fuckhtml->parseJsString($this->js_image[$id]); if( $return != "" && $return != "" ){ if( preg_match( '/^\/\//', $return ) ){ return 'https:' . $return; } return $return; } return null; } } private function parsecarousels(&$item_to_remove = false){ $carousels = $this->fuckhtml ->getElementsByClassName( $this->findstyles( [ "padding" => "16px", "position" => "relative" ], self::is_class ) ); $return = []; for($i=0; $i<count($carousels); $i++){ if(!isset($carousels[$i]["outerHTML"])){ continue; } $this->fuckhtml->load($carousels[$i]); if($item_to_remove !== false){ $item_to_remove = str_replace( $carousels[$i]["outerHTML"], "", $item_to_remove ); } $pcitems = $this->fuckhtml ->getElementsByClassName( "pcitem", "div" ); foreach($pcitems as $pcitem){ $this->fuckhtml->load($pcitem); $out = [ "url" => null, "ref" => null, "image" => null, "thumb_width" => null, "thumb_height" => null, "image_width" => null, "image_height" => null, "title" => null, "description" => null, "subtext" => null, "date" => null ]; $url = $this->unshiturl( $this->fuckhtml ->getElementsByTagName("a") [0] ["attributes"] ["href"], true ); // set ref $out["ref"] = $url["ref"]; // set url $out["url"] = $url["url"]; // set sizes $out["thumb_width"] = $url["thumb_width"]; $out["thumb_height"] = $url["thumb_height"]; $out["image_width"] = $url["image_width"]; $out["image_height"] = $url["image_height"]; // get image $out["image"] = $this->fuckhtml ->getElementsByTagName( "img" ); if(count($out["image"]) !== 0){ // get title from image if(isset($out["image"][0]["attributes"]["alt"])){ $out["title"] = $this->titledots( $this->fuckhtml ->getTextContent( $out["image"][0]["attributes"]["alt"] ) ); } // get image url if(isset($out["image"][0]["attributes"]["id"])){ $out["image"] = $this->getimage($out["image"][0]["attributes"]["id"]); } elseif(isset($out["image"][0]["attributes"]["data-ll"])){ $out["image"] = $this->fuckhtml ->getTextContent( $out["image"][0]["attributes"]["data-ll"] ); }else{ // failed to get image information $out["image"] = null; } if($out["image"] == ''){ // found arrow image base64, skip entry continue; } }else{ // Could not find any image in node $out["image"] = null; } // get title from spans $title = $this->fuckhtml ->getElementsByClassName( $this->findstyles( [ "color" => "#1967d2" ], self::is_class ), "span" ); if(count($title) !== 0){ $out["title"] = $this->fuckhtml ->getTextContent( $title[0] ); } // get textnodes $textnodes = $this->fuckhtml ->getElementsByClassName( $this->findstyles( [ "white-space" => "pre-line", "word-wrap" => "break-word" ], self::is_class ) ); $subtext = null; if(count($textnodes) !== 0){ // get date $date = $this->fuckhtml ->getTextContent( $textnodes[count($textnodes) - 1], true ); if(str_replace("\n", " ", $date) == $title){ $date = null; }else{ if(strpos($date, "\n") !== false){ $date = explode("\n", $date); $date = $date[count($date) - 1]; } elseif(strpos($date, "•") !== false){ $date = explode("•", $date); $date = ltrim($date[count($date) - 1]); }else{ $date = null; } } if($date !== null){ $date = strtotime($date); } // get description $description = $this->fuckhtml ->getTextContent( $textnodes[0] ); if($out["title"] === null){ if($date === null){ $out["title"] = $description; $description = null; }else{ $out["title"] = parse_url($out["url"], PHP_URL_HOST); } } if(isset($textnodes[1])){ $out["subtext"] = $this->fuckhtml ->getTextContent( $textnodes[1] ); } }else{ $date = null; $description = null; } $out["date"] = $date; $out["description"] = $this->titledots($description); if($out["url"] === null){ $out["url"] = $out["title"]; } if($out["title"] == $out["description"]){ $out["description"] = null; } $return[$i][] = $out; } } return $return; } private function unshiturl($url, $return_size = false){ // get parameters from URL $url = $this->fuckhtml ->getTextContent($url); $newurl = parse_url($url, PHP_URL_QUERY); if($newurl == ""){ // probably telephone number return $url; } $url = $newurl; unset($newurl); parse_str($url, $query); if(isset($query["imgurl"])){ $url = $query["imgurl"]; } elseif(isset($query["q"])){ $url = $query["q"]; } // rewrite URLs to remove extra tracking parameters $domain = parse_url($url, PHP_URL_HOST); if( preg_match( '/wikipedia.org$/', $domain ) ){ // rewrite wikipedia mobile URLs to desktop $url = $this->replacedomain( $url, preg_replace( '/([a-z0-9]+)(\.m\.)/', '$1.', $domain ) ); } elseif( preg_match( '/imdb\.com$|youtube\.[^.]+$/', $domain ) ){ // rewrite imdb and youtube mobile URLs too $url = $this->replacedomain( $url, preg_replace( '/^m\./', "", $domain ) ); } elseif( preg_match( '/play\.google\.[^.]+$/', $domain ) ){ // remove referrers from play.google.com $oldquery = parse_url($url, PHP_URL_QUERY); if($oldquery !== null){ parse_str($oldquery, $query); if(isset($query["referrer"])){ unset($query["referrer"]); } if(isset($query["hl"])){ unset($query["hl"]); } if(isset($query["gl"])){ unset($query["gl"]); } $query = http_build_query($query); $url = str_replace( $oldquery, $query, $url ); } } elseif( preg_match( '/twitter\.com$/', $domain ) ){ // remove more referrers from twitter.com $oldquery = parse_url($url, PHP_URL_QUERY); if($oldquery !== null){ parse_str($oldquery, $query); if(isset($query["ref_src"])){ unset($query["ref_src"]); } $query = http_build_query($query); if($query != ""){ $query .= "?" . $query; } $url = str_replace( '?' . $oldquery, $query, $url ); } } elseif( preg_match( '/maps\.google\.[^.]+/', $domain ) ){ if(stripos($url, "maps?") !== false){ //https://maps.google.com/maps?daddr=Johnny,+603+Rue+St+Georges,+Saint-J%C3%A9r%C3%B4me,+Quebec+J7Z+5B7 $query = parse_url($url, PHP_URL_QUERY); if($query !== null){ parse_str($query, $query); if(isset($query["daddr"])){ $url = "https://maps.google.com/maps?daddr=" . urlencode($query["daddr"]); } } } } if($return_size){ return [ "url" => $url, "ref" => isset($query["imgrefurl"]) ? $query["imgrefurl"] : null, "thumb_width" => isset($query["tbnw"]) ? (int)$query["tbnw"] : null, "thumb_height" => isset($query["tbnh"]) ? (int)$query["tbnh"] : null, "image_width" => isset($query["w"]) ? (int)$query["w"] : null, "image_height" => isset($query["h"]) ? (int)$query["h"] : null ]; } return $url; } private function replacedomain($url, $domain){ return preg_replace( '/(https?:\/\/)([^\/]+)/', '$1' . $domain, $url ); } private function titledots($title){ return rtrim($title, ". \t\n\r\0\x0B"); } private function detect_sorry($html){ $this->fuckhtml->load($html); $detect_sorry = $this->fuckhtml ->getElementsByTagName("title"); if( isset($detect_sorry[0]) && $detect_sorry[0]["innerHTML"] == "302 Moved" ){ // may be consent.google.com in europe or /sorry captcha page $url = $this->fuckhtml ->getElementsByTagName("a"); if( strpos( parse_url( $this->fuckhtml ->getTextContent( $url[0]["attributes"]["href"] ), PHP_URL_PATH ), "/sorry" ) === 0 ){ // found /sorry return "Google blocked this 4get instance. Please setup a proxy!"; } // found consent.google, should not happen anymore return "Google served a GPDR consent form. This should not happen, please report if you encounter this message"; } return false; } }