From 3e2c3fc5d90a0b0f859358d1c5b2614ab26905a4 Mon Sep 17 00:00:00 2001 From: lolcat <will@lolcat.ca> Date: Wed, 2 Apr 2025 21:40:53 -0400 Subject: [PATCH] fixed google videos --- scraper/google.php | 913 ++++++++++----------------------------------- 1 file changed, 207 insertions(+), 706 deletions(-) diff --git a/scraper/google.php b/scraper/google.php index d2c9eda..b3b3b13 100644 --- a/scraper/google.php +++ b/scraper/google.php @@ -578,697 +578,6 @@ class google{ } - private function parsepage($html, $pagetype, $search, $proxy, $params){ - - $out = [ - "status" => "ok", - "spelling" => [ - "type" => "no_correction", - "using" => null, - "correction" => null - ], - "npt" => null, - "answer" => [], - "web" => [], - "image" => [], - "video" => [], - "news" => [], - "related" => [] - ]; - - $this->fuckhtml->load($html); - - $this->detect_sorry(); - - // parse all <style> tags - $this->parsestyles(); - - // get javascript images - $this->scrape_dimg($html); - - // get html blobs - preg_match_all( - '/function\(\){window\.jsl\.dh\(\'([^\']+?)\',\'(.+?[^\'])\'\);/', - $html, - $blobs - ); - - $this->blobs = []; - if(isset($blobs[1])){ - - for($i=0; $i<count($blobs[1]); $i++){ - - $this->blobs[$blobs[1][$i]] = - $this->fuckhtml - ->parseJsString( - $blobs[2][$i] - ); - } - } - - $this->scrape_imagearr($html); - - // - // load result column - // - - $result_div = - $this->fuckhtml - ->getElementById( - "center_col", - "div" - ); - - if($result_div === false){ - - throw new Exception("Failed to grep result div"); - } - - $this->fuckhtml->load($result_div); - - // important for later - $last_page = false; - - // - // Get text results - // - $results = - $this->fuckhtml - ->getElementsByClassName( - "g", - "div" - ); - - $this->skip_next = false; - - foreach($results as $result){ - - if($this->skip_next){ - - $this->skip_next = false; - continue; - } - - $this->fuckhtml->load($result); - - $web = [ - "title" => null, - "description" => null, - "url" => null, - "date" => null, - "type" => "web", - "thumb" => [ - "url" => null, - "ratio" => null - ], - "sublink" => [], - "table" => [] - ]; - - // Detect presence of sublinks - $g = - $this->fuckhtml - ->getElementsByClassName( - "g", - "div" - ); - - if(count($g) > 0){ - - // skip on next iteration - $this->skip_next = true; - } - - // get title - $h3 = - $this->fuckhtml - ->getElementsByTagName( - "h3" - ); - - if(count($h3) === 0){ - - continue; - } - - $web["title"] = - $this->titledots( - $this->fuckhtml - ->getTextContent( - $h3[0] - ) - ); - - // get url - $as = - $this->fuckhtml - ->getElementsByTagName( - "a" - ); - - $web["url"] = - $this->unshiturl( - $as[0] - ["attributes"] - ["href"] - ); - - if( - !preg_match( - '/^http/', - $web["url"] - ) - ){ - - // skip if invalid url is found - continue; - } - - // - // get viewcount, time posted and follower count from <cite> tag - // - $cite = - $this->fuckhtml - ->getElementsByTagName( - "cite" - ); - - if(count($cite) !== 0){ - - $this->fuckhtml->load($cite[0]); - - $spans = - $this->fuckhtml - ->getElementsByTagName("span"); - - if(count($spans) === 0){ - - $cites = - explode( - "·", - $this->fuckhtml - ->getTextContent( - $cite[0] - ) - ); - - foreach($cites as $cite){ - - $cite = trim($cite); - - if( - preg_match( - '/(.+) (views|followers|likes)$/', - $cite, - $match - ) - ){ - - $web["table"][ucfirst($match[2])] = - $match[1]; - }elseif( - preg_match( - '/ago$/', - $cite - ) - ){ - - $web["date"] = - strtotime($cite); - } - } - } - - // reset - $this->fuckhtml->load($result); - } - - // - // attempt to fetch description cleanly - // - $description = - $this->fuckhtml - ->getElementsByAttributeValue( - "style", - "-webkit-line-clamp:2" - ); - - if(count($description) !== 0){ - - $web["description"] = - $this->titledots( - $this->fuckhtml - ->getTextContent( - $description[0] - ) - ); - }else{ - - // use ANOTHER method where the description is a header of the result - $description = - $this->fuckhtml - ->getElementsByAttributeValue( - "data-attrid", - "wa:/description" - ); - - if(count($description) !== 0){ - - // get date off that shit - $date = - $this->fuckhtml - ->getElementsByClassName( - $this->getstyle( - [ - "font-size" => "12px", - "line-height" => "1.34", - "display" => "inline-block", - "font-family" => "google sans,arial,sans-serif", - "padding-right" => "0", - "white-space" => "nowrap" - ] - ), - "span" - ); - - if(count($date) !== 0){ - - $description[0]["innerHTML"] = - str_replace( - $date[0]["outerHTML"], - "", - $description[0]["innerHTML"] - ); - - $web["date"] = - strtotime( - $this->fuckhtml - ->getTextContent( - $date[0] - ) - ); - } - - $web["description"] = - $this->fuckhtml - ->getTextContent( - $description[0] - ); - }else{ - - // Yes.. You guessed it, use ANOTHER method to get descriptions - // off youtube containers - $description = - $this->fuckhtml - ->getElementsByClassName( - $this->getstyle( - [ - "-webkit-box-orient" => "vertical", - "display" => "-webkit-box", - "font-size" => "14px", - "-webkit-line-clamp" => "2", - "line-height" => "22px", - "overflow" => "hidden", - "word-break" => "break-word", - "color" => "#4d5156" - ] - ), - "div" - ); - - if(count($description) !== 0){ - - // check for video duration - $duration = - $this->fuckhtml - ->getElementsByClassName( - $this->getstyle( - [ - "background-color" => "rgba(0,0,0,0.6)", - "color" => "#fff", - "fill" => "#fff" - ] - ), - "div" - ); - - if(count($duration) !== 0){ - - $web["table"]["Duration"] = - $this->fuckhtml - ->getTextContent( - $duration[0] - ); - } - - $web["description"] = - $this->titledots( - html_entity_decode( - $this->fuckhtml - ->getTextContent( - $description[0] - ) - ) - ); - - // get author + time posted - $info = - $this->fuckhtml - ->getElementsByClassName( - $this->getstyle( - [ - "color" => "var(" . $this->getcolorvar("#70757a") . ")", - "font-size" => "14px", - "line-height" => "20px", - "margin-top" => "12px" - ] - ), - "div" - ); - - if(count($info) !== 0){ - - $info = - explode( - "·", - $this->fuckhtml - ->getTextContent( - $info[0] - ) - ); - - switch(count($info)){ - - case 3: - $web["table"]["Author"] = trim($info[1]); - $web["date"] = strtotime(trim($info[2])); - break; - - case 2: - $web["date"] = strtotime(trim($info[1])); - break; - } - } - } - } - } - - // - // get categories of content within the search result - // - $cats = - $this->fuckhtml - ->getElementsByAttributeName( - "data-sncf", - "div" - ); - - foreach($cats as $cat){ - - $this->fuckhtml->load($cat); - - // detect image category - $images = - $this->fuckhtml - ->getElementsByTagName( - "img" - ); - - if(count($images) !== 0){ - - foreach($images as $image){ - - if(isset($image["attributes"]["id"])){ - // we found an image - - if(isset($image["attributes"]["width"])){ - - $width = (int)$image["attributes"]["width"]; - - if($width == 110){ - - $ratio = "1:1"; - }elseif($width > 110){ - - $ratio = "16:9"; - }else{ - - $ratio = "9:16"; - } - }else{ - - $ratio = "1:1"; - } - - $web["thumb"] = [ - "url" => $this->getdimg($image["attributes"]["id"]), - "ratio" => $ratio - ]; - - continue 2; - } - } - } - - // Detect rating - $spans_unfiltered = - $this->fuckhtml - ->getElementsByTagName( - "span" - ); - - $spans = - $this->fuckhtml - ->getElementsByAttributeName( - "aria-label", - $spans_unfiltered - ); - - foreach($spans as $span){ - - if( - preg_match( - '/^Rated/', - $span["attributes"]["aria-label"] - ) - ){ - - // found rating - // scrape rating - preg_match( - '/([0-9.]+).*([0-9.]+)/', - $span["attributes"]["aria-label"], - $rating - ); - - if(isset($rating[1])){ - - $web["table"]["Rating"] = - $rating[1] . "/" . $rating[2]; - } - - $has_seen_reviews = 0; - foreach($spans_unfiltered as $span_unfiltered){ - - if( - preg_match( - '/([0-9,.]+) +([A-z]+)$/', - $this->fuckhtml - ->getTextContent( - $span_unfiltered - ), - $votes - ) - ){ - - $has_seen_reviews++; - $web["table"][ucfirst($votes[2])] = $votes[1]; - continue; - } - - $text = - $this->fuckhtml - ->getTextContent( - $span_unfiltered - ); - - if( - $text == " " || - $text == "" - ){ - - break; - } - - switch($has_seen_reviews){ - - case 1: - // scrape price - $web["table"]["Price"] = $text; - $has_seen_reviews++; - break; - - case 2: - // scrape platform - $web["table"]["Platform"] = $text; - $has_seen_reviews++; - break; - - case 3: - // Scrape type - $web["table"]["Medium"] = $text; - break; - } - } - - continue 2; - } - } - - // check if its an answer header - $answer_header = - $this->fuckhtml - ->getElementsByClassName( - $this->getstyle( - [ - "overflow" => "hidden", - "text-overflow" => "ellipsis" - ] - ), - "span" - ); - - if(count($answer_header) !== 0){ - - $link = - $this->fuckhtml - ->getElementsByTagName( - "a" - ); - - $cat["innerHTML"] = - str_replace( - $link[0]["outerHTML"], - "", - $cat["innerHTML"] - ); - - continue; - } - - // we probed everything, assume this is the description - // if we didn't find one cleanly previously - if($web["description"] === null){ - $web["description"] = - $this->titledots( - $this->fuckhtml - ->getTextContent( - $cat - ) - ); - } - } - - // check if description contains date - $description = explode("—", $web["description"], 2); - - if( - count($description) === 2 && - strlen($description[0]) <= 20 - ){ - - $date = strtotime($description[0]); - - if($date !== false){ - - $web["date"] = $date; - $web["description"] = ltrim($description[1]); - } - } - - // fetch youtube thumbnail - $thumbnail = - $this->fuckhtml - ->getElementsByClassName( - $this->getstyle( - [ - "border-radius" => "8px", - "height" => "fit-content", - "justify-content" => "center", - "margin-right" => "20px", - "margin-top" => "4px", - "position" => "relative", - "width" => "fit-content" - ] - ), - "div" - ); - - if(count($thumbnail) !== 0){ - - // load thumbnail container - $this->fuckhtml->load($thumbnail[0]); - - $image = - $this->fuckhtml - ->getElementsByTagName( - "img" - ); - - if( - count($image) !== 0 && - isset($image[0]["attributes"]["id"]) - ){ - - $web["thumb"] = [ - "url" => - $this->unshit_thumb( - $this->getdimg( - $image[0]["attributes"]["id"] - ) - ), - "ratio" => "16:9" - ]; - } - - // reset - $this->fuckhtml->load($result); - } - - $out["web"][] = $web; - } - - // reset - $this->fuckhtml->load($result_div); - - // - // craft $npt token - // - if( - $last_page === false && - count($out["web"]) !== 0 - ){ - if(!isset($params["start"])){ - - $params["start"] = 20; - }else{ - - $params["start"] += 20; - } - - $out["npt"] = - $this->backend - ->store( - json_encode($params), - $pagetype, - $proxy - ); - } - - return $out; - } - - private function scrape_dimg($html){ // get images loaded through javascript @@ -2554,8 +1863,6 @@ class google{ [$params, $proxy] = $this->backend->get($get["npt"], "video"); $params = json_decode($params, true); - $search = $params["q"]; - }else{ $search = $get["s"]; $country = $get["country"]; @@ -2569,9 +1876,9 @@ class google{ $params = [ "q" => $search, - "tbm" => "vid", + "udm" => "7", "hl" => "en", - "num" => "20" + "num" => 20 ]; // country @@ -2637,12 +1944,35 @@ class google{ throw new Exception("Failed to get HTML"); } - //$html = file_get_contents("scraper/google.html"); + if(!isset($params["start"])){ + + $params["start"] = 0; + } + $params["start"] += 20; + + $this->fuckhtml->load($html); + + // + // Parse web video page + // + $this->detect_sorry(); + + // parse all <style> tags + $this->parsestyles(); + + // get javascript images + $this->scrape_dimg($html); + + $this->scrape_imagearr($html); - $response = $this->parsepage($html, "videos", $search, $proxy, $params); $out = [ "status" => "ok", - "npt" => $response["npt"], + "npt" => + $this->backend->store( + json_encode($params), + "videos", + $proxy + ), "video" => [], "author" => [], "livestream" => [], @@ -2650,21 +1980,192 @@ class google{ "reel" => [] ]; - foreach($response["web"] as $result){ + $search_div = + $this->fuckhtml + ->getElementById( + "center_col" + ); + + if($search_div === false){ + + throw new Exception("Failed to grep search div"); + } + + $this->fuckhtml->load($search_div); + + $results = + $this->fuckhtml + ->getElementsByClassName( + $this->getstyle([ + "margin" => "0px 0px 30px" + ]), + "div" + ); + + foreach($results as $result){ + + $this->fuckhtml->load($result); + + $url = + $this->fuckhtml + ->getElementsByTagName( + "a" + ); + + if(count($url) === 0){ + + // no url, weird, continue + continue; + } + + $title = + $this->fuckhtml + ->getElementsByTagName( + "h3" + ); + + if(count($title) === 0){ + + // no title, weird, continue + continue; + } + + // get description + $description = + $this->fuckhtml + ->getElementsByClassName( + $this->getstyle([ + "-webkit-box-orient" => "vertical", + "display" => "-webkit-box", + "-webkit-line-clamp" => "2", + "overflow" => "hidden", + "word-break" => "break-word" + ]), + "div" + ); + + if(count($description) === 0){ + + $description = null; + }else{ + + $description = + html_entity_decode( + $this->titledots( + $this->fuckhtml + ->getTextContent( + $description[0] + ) + ) + ); + } + + // get author + date posted + $metadiv = + $this->fuckhtml + ->getElementsByClassName( + $this->getstyle([ + "margin-top" => "12px" + ]), + "div" + ); + + $author = null; + $date = null; + + if(count($metadiv) !== 0){ + + $metadiv = + explode( + "·", + $this->fuckhtml + ->getTextContent( + $metadiv[0] + ) + ); + + if(count($metadiv) === 3){ + + $author = trim($metadiv[1]); + $date = strtotime(trim($metadiv[2])); + }elseif(count($metadiv) === 2){ + + $author = trim($metadiv[0]); + $date = strtotime(trim($metadiv[1])); + } + } + + $thumb = [ + "url" => null, + "ratio" => null + ]; + + $image = + $this->fuckhtml + ->getElementsByTagName( + "img" + ); + + $duration = null; + + if( + count($image) !== 0 && + isset($image[0]["attributes"]["id"]) + ){ + + $thumb = [ + "url" => $this->getdimg($image[0]["attributes"]["id"]), + "ratio" => "16:9" + ]; + + // get duration + $duration = + $this->fuckhtml + ->getElementsByClassName( + $this->getstyle([ + "background-color" => "rgba(0,0,0,0.6)", + "color" => "#fff", + "fill" => "#fff" + ]) + ); + + if(count($duration) !== 0){ + + $duration = + $this->hms2int( + $this->fuckhtml + ->getTextContent( + $duration[0] + )); + }else{ + + $duration = null; + } + } $out["video"][] = [ - "title" => $result["title"], - "description" => $result["description"], + "title" => + $this->titledots( + $this->fuckhtml + ->getTextContent( + $title[0] + ) + ), + "description" => $description, "author" => [ - "name" => isset($result["table"]["Author"]) ? $result["table"]["Author"] : null, + "name" => $author, "url" => null, "avatar" => null ], - "date" => $result["date"], - "duration" => isset($result["table"]["Duration"]) ? $this->hms2int($result["table"]["Duration"]) : null, + "date" => $date, + "duration" => $duration, "views" => null, - "thumb" => $result["thumb"], - "url" => $result["url"] + "thumb" => $thumb, + "url" => + $this->fuckhtml + ->getTextContent( + $url[0]["attributes"]["href"] + ) ]; }