diff --git a/scraper/google.php b/scraper/google.php index 2f71e0e..73fd7a4 100644 --- a/scraper/google.php +++ b/scraper/google.php @@ -573,1148 +573,17 @@ class google{ public function web($get){ - // it broke again. lasted 3 months - // lets hope for another solid 3 month - - $out = [ - "status" => "ok", - "spelling" => [ - "type" => "no_correction", - "using" => null, - "correction" => null - ], - "npt" => null, - "answer" => [], - "web" => [], - "image" => [], - "video" => [], - "news" => [], - "related" => [] - ]; - - if($get["npt"]){ - - [$get, $proxy] = $this->backend->get($get["npt"], "web"); - - try{ - $html = - $this->get( - $proxy, - "https://www.google.com" . $get, - [], - true - ); - }catch(Exception $error){ - - throw new Exception("Failed to get HTML"); - } - }else{ - - $search = $get["s"]; - $country = $get["country"]; - $nsfw = $get["nsfw"]; - $lang = $get["lang"]; - $older = $get["older"]; - $newer = $get["newer"]; - $spellcheck = $get["spellcheck"]; - $proxy = $this->backend->get_ip(); - - $offset = 0; - - $params = [ - "q" => $search, - "hl" => "en", - "udm" => 14 - ]; - - // country - if($country != "any"){ - - $params["gl"] = $country; - } - - // nsfw - $params["safe"] = $nsfw == "yes" ? "off" : "active"; - - // language - if($lang != "any"){ - - $params["lr"] = "lang_" . $lang; - } - - // generate tbs - $tbs = []; - - // get date - $older = $older === false ? null : date("m/d/Y", $older); - $newer = $newer === false ? null : date("m/d/Y", $newer); - - if( - $older !== null || - $newer !== null - ){ - - $tbs["cdr"] = "1"; - $tbs["cd_min"] = $newer; - $tbs["cd_max"] = $older; - } - - // spellcheck filter - if($spellcheck == "no"){ - - $params["nfpr"] = "1"; - } - - if(count($tbs) !== 0){ - - $params["tbs"] = ""; - - foreach($tbs as $key => $value){ - - $params["tbs"] .= $key . ":" . $value . ","; - } - - $params["tbs"] = rtrim($params["tbs"], ","); - } - - try{ - $html = - $this->get( - $proxy, - "https://www.google.com/search", - $params, - true - ); - }catch(Exception $error){ - - throw new Exception("Failed to get HTML"); - } - - //$html = file_get_contents("scraper/google.html"); - } - - // init - $this->fuckhtml->load($html); - $this->detect_sorry(); - $this->parsestyles(); - - // get javascript images - $this->scrape_dimg($html); - $this->scrape_imagearr($html); - - // get next page - $npt = - $this->fuckhtml - ->getElementsByAttributeValue( - "aria-label", - "More search results", - "a" - ); - - if(count($npt) === 0){ - - // maybe we have the npt object from 2nd page, probe for that - $npt = - $this->fuckhtml - ->getElementsByAttributeValue( - "aria-label", - "Next page", - "a" - ); - } - - if(count($npt) !== 0){ - - $out["npt"] = - $this->backend->store( - $this->fuckhtml - ->getTextContent( - $npt[0]["attributes"]["href"] - ), - "web", - $proxy - ); - } - - // outer div is .MjjYud - // inner div always contain role="presentation" - - $outer = - $this->fuckhtml - ->getElementsByClassName( - "MjjYud", - "div" - ); - - // used later - $fancycontainer_class = - explode( - " ", - $this->getstyle([ - "padding-top" => "4px", - "padding-bottom" => "calc(12px*1)" - ]), - 2 - ); - - if(count($fancycontainer_class) === 2){ - - $fancycontainer_class = $fancycontainer_class[1]; - }else{ - - $fancycontainer_class = false; - } - - foreach($outer as $container){ - - $this->fuckhtml->load($container); - - // probe for search result - $title = - $this->fuckhtml - ->getElementsByAttributeValue( - "role", - "link", - "div" - ); - - if(count($title) !== 0){ - - // we found a search result - - $title = - $this->titledots( - $this->fuckhtml - ->getTextContent( - $title[0] - ) - ); - - // get url - $sprobe = - $this->fuckhtml - ->getElementsByTagName( - "a" - ); - - $link = null; - - foreach($sprobe as $possible_link){ - - if( - isset($possible_link["attributes"]["href"]) && - preg_match( - '/^\/url\?q=/', - $possible_link["attributes"]["href"] - ) - ){ - - $link = - $this->fuckhtml - ->getTextContent( - $possible_link["attributes"]["href"] - ); - - break; - } - } - - if($link === null){ - - // should not happen - continue; - } - - // get description - // as usual, theres a thousand fucking possible divs for this one - - // probe for youtube-like description - $description = - $this->fuckhtml - ->getElementsByClassName( - $this->getstyle([ - "align-items" => "flex-start", - "display" => "flex", - "justify-content" => "center", - "padding" => "7px 12px", - "padding-right" => "0", - "padding-top" => "0" - ]), - "div" - ); - - $ratio = "16:9"; - - if(count($description) === 0){ - - // fail. find the one with the image on the right handside - $description = - $this->fuckhtml - ->getElementsByAttributeValue( - "style", - "padding-top:2px;padding-right:8px;padding-left:16px;padding-bottom:12px", - "div" - ); - - $ratio = "1:1"; - - if(count($description) === 0){ - - // fail. find the one that is used the most - $description = - $this->fuckhtml - ->getElementsByAttributeValue( - "style", - "-webkit-line-clamp:3", - "div" - ); - - if(count($description) === 0){ - - // last fail. this one appears with divs that have prices - $description = - $this->fuckhtml - ->getElementsByAttributeValue( - "style", - "max-width:100vw;grid-area:nke7rc;padding-top:2px;padding-right:8px;padding-left:16px;padding-bottom:6px", - "div" - ); - } - } - } - - if(count($description) === 0){ - - // should not happen but whatever - $description = null; - }else{ - - $description = - $this->titledots( - $this->fuckhtml - ->getTextContent( - $description[0] - ) - ); - } - - // probe for date - $desc2 = explode("—", $description, 2); - - $time = null; - - if(count($desc2) === 2){ - - $time = strtotime($desc2[0]); - - if( - strlen($desc2[0]) < 16 && - $time !== false - ){ - - $description = ltrim($desc2[1]); - }else{ - - $time = null; - } - } - - $thumb = [ - "ratio" => null, - "url" => null, - ]; - - // get thumbnail - $images = - $this->fuckhtml - ->getElementsByTagName( - "img" - ); - - foreach($images as $image){ - - if(isset($image["attributes"]["id"])){ - - $thumb = [ - "ratio" => $ratio, - "url" => $this->getdimg($image["attributes"]["id"]) - ]; - } - } - - // get sublinks - $sublinks = []; - - // probe for the fancy version - if($fancycontainer_class !== false){ - $fancycontainer = - $this->fuckhtml - ->getElementsByClassName( - $fancycontainer_class, - "div" - ); - } - - if( - $fancycontainer_class !== false && - count($fancycontainer) !== 0 - ){ - - $this->fuckhtml->load($fancycontainer[0]); - - $as = - $this->fuckhtml - ->getElementsByTagName( - "a" - ); - - foreach($as as $a){ - - $sublinks[] = [ - "title" => - $this->fuckhtml - ->getTextContent( - $a - ), - "description" => null, - "date" => null, - "url" => - $this->unshiturl( - $a["attributes"]["href"] - ) - ]; - } - } - - $out["web"][] = [ - "title" => $title, - "description" => $description, - "url" => $this->unshiturl($link), - "date" => $time, - "type" => "web", - "thumb" => $thumb, - "sublink" => $sublinks, - "table" => [] - ]; - continue; - } - - // probe for containers with a title header - $title_header = - $this->fuckhtml - ->getElementsByClassName( - $this->getstyle([ - "display" => "flex", - "flex-wrap" => "wrap", - "position" => "relative", - "padding" => "16px" - ]) - ); - - if(count($title_header) !== 0){ - - $title_header = - strtolower( - $this->fuckhtml - ->getTextContent( - $title_header[0] - ) - ); - - switch($title_header){ - - case "people also search for": - // get all related searches - $relateds = - $this->fuckhtml - ->getElementsByClassName( - $this->getstyle([ - "display" => "flex", - "height" => "100%", - "flex-direction" => "column", - "max-width" => "100%" - ]) - ); - - foreach($relateds as $r){ - - $out["related"][] = - $this->fuckhtml - ->getTextContent( - $r - ); - } - break; - } - - continue; - } - } - - $out["related"] = array_values(array_unique($out["related"])); - - return $out; + throw new Exception("There are no known ways to scrape Google's /search endpoint without JS at this time. I'm working on a method that extracts cookies from browsers. Use Google API/CSE/Yahoo JP/Startpage for google results for now."); } public function video($get){ - - if($get["npt"]){ - - [$params, $proxy] = $this->backend->get($get["npt"], "video"); - $params = json_decode($params, true); - - }else{ - $search = $get["s"]; - $country = $get["country"]; - $nsfw = $get["nsfw"]; - $older = $get["older"]; - $newer = $get["newer"]; - $duration = $get["duration"]; - $quality = $get["quality"]; - $captions = $get["captions"]; - $proxy = $this->backend->get_ip(); - - $params = [ - "q" => $search, - "udm" => "7", - "hl" => "en", - "num" => 20 - ]; - - // country - if($country != "any"){ - - $params["gl"] = $country; - } - - // nsfw - $params["safe"] = $nsfw == "yes" ? "off" : "active"; - - $tbs = []; - - // get date - $older = $older === false ? null : date("m/d/Y", $older); - $newer = $newer === false ? null : date("m/d/Y", $newer); - - if( - $older !== null || - $newer !== null - ){ - - $tbs["cdr"] = "1"; - $tbs["cd_min"] = $newer; - $tbs["cd_max"] = $older; - } - - // duration - if($duration != "any"){ - - $tbs[] = "dur:" . $duration; - } - - // quality - if($quality != "any"){ - - $tbs[] = "hq:" . $quality; - } - - // captions - if($captions != "any"){ - - $tbs[] = "cc:" . $captions; - } - - // append tbs - if(count($tbs) !== 0){ - - $params["tbs"] = - implode(",", $tbs); - } - } - - try{ - $html = - $this->get( - $proxy, - "https://www.google.com/search", - $params - ); - }catch(Exception $error){ - - throw new Exception("Failed to get HTML"); - } - - if(!isset($params["start"])){ - - $params["start"] = 0; - } - $params["start"] += 20; - - $this->fuckhtml->load($html); - - // - // Parse web video page - // - $this->detect_sorry(); - - // parse all