new google message

2026-04-25 23:03:42 -04:00
parent e63a17d6db
commit b1f5974e40
1 changed files with 31 additions and 248 deletions
--- a/scraper/google.php
+++ b/scraper/google.php
@@ -12,6 +12,8 @@ class google{
 		include "lib/backend.php";
 		$this->backend = new backend("google");
 		$this->message = "Still working on a Google scraper that uses a headful browser. It will require Firefox + a webExtension running on a dedicated server. Waiting for my EDID adapter and we can get the show going. In the meantime, use the Google CSE/API or Yahoo JP/Startpage scrapers. They're all crippled in their own special ways but they're serviceable I guess.";
 	}
 	public function getfilters($page){
@@ -505,7 +507,7 @@ class google{
 		}
 	}
-	private function get($proxy, $url, $get = [], $alt_ua = false){
+	private function get($proxy, $url, $get = []){
 		$curlproc = curl_init();
@@ -518,35 +520,22 @@ class google{
 		curl_setopt($curlproc, CURLOPT_ENCODING, ""); // default encoding
-		if($alt_ua === true){
+		curl_setopt($curlproc, CURLOPT_HTTP_VERSION, CURL_HTTP_VERSION_2_0);
-			
+		curl_setopt($curlproc, CURLOPT_HTTPHEADER, [
-			curl_setopt($curlproc, CURLOPT_HTTPHEADER, [
+			"User-Agent: " . config::USER_AGENT,
-				"User-Agent: Mozilla/5.0 (iPhone; CPU iPhone OS 26_0_0 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) GSA/406.0.862495628 Mobile/15E148 Safari/604.1",
+			"Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
-				"Accept: text/html, application/xml;q=0.9, */*;q=0.8",
+			"Accept-Language: en-US,en;q=0.5",
-				"Accept-Language: en-US,en;q=0.8",
+			"Accept-Encoding: gzip",
-				"Accept-Encoding: gzip, deflate",
+			"DNT: 1",
-				"Connection: Keep-Alive",
+			"Connection: keep-alive",
-				"Cache-Control: no-cache"
+			"Upgrade-Insecure-Requests: 1",
-			]);
+			"Sec-Fetch-Dest: document",
-		}else{
+			"Sec-Fetch-Mode: navigate",
-			
+			"Sec-Fetch-Site: none",
-			curl_setopt($curlproc, CURLOPT_HTTP_VERSION, CURL_HTTP_VERSION_2_0);
+			"Sec-Fetch-User: ?1",
-			curl_setopt($curlproc, CURLOPT_HTTPHEADER, [
+			"Priority: u=1",
-				"User-Agent: " . config::USER_AGENT,
+			"TE: trailers"
-				"Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
+		]);
 				"Accept-Language: en-US,en;q=0.5",
 				"Accept-Encoding: gzip",
 				"DNT: 1",
 				"Connection: keep-alive",
 				"Upgrade-Insecure-Requests: 1",
 				"Sec-Fetch-Dest: document",
 				"Sec-Fetch-Mode: navigate",
 				"Sec-Fetch-Site: none",
 				"Sec-Fetch-User: ?1",
 				"Priority: u=1",
 				"TE: trailers"
 			]);
 		}
 		curl_setopt($curlproc, CURLOPT_RETURNTRANSFER, true);
 		curl_setopt($curlproc, CURLOPT_SSL_VERIFYHOST, 2);
@@ -574,228 +563,22 @@ class google{
 	public function web($get){
-		throw new Exception("There are no known ways to scrape Google's /search endpoint without JS at this time. I'm working on a method that extracts cookies from browsers. Use Google API/CSE/Yahoo JP/Startpage for google results for now.");
+		throw new Exception($this->message);
 	}
 	public function video($get){
 		throw new Exception("There are no known ways to scrape Google's /search endpoint without JS at this time. I'm working on a method that extracts cookies from browsers. Use Google API/CSE/Yahoo JP/Startpage for google results for now.");
 	}
 	public function news($get){
 		throw new Exception("There are no known ways to scrape Google's /search endpoint without JS at this time. I'm working on a method that extracts cookies from browsers. Use Google API/CSE/Yahoo JP/Startpage for google results for now.");
 	}
 	public function image($get){
 		throw new Exception($this->message);
 	}
 		// generate parameters
 		if($get["npt"]){
-			[$params, $proxy] =
+	public function video($get){
-				$this->backend->get(
+		throw new Exception($this->message);
-					$get["npt"],
+	}
 					"images"
 				);
 			$params = json_decode($params, true);
-			$page = $params["page"] + 1;
+	public function news($get){
-			$params = $params["params"];
+		throw new Exception($this->message);
 			$params["async"] = "_fmt:json,p:1,ijn:{$page}";
 		}else{
 			$search = $get["s"];
 			if(strlen($search) === 0){
 				throw new Exception("Search term is empty!");
 			}
 			$proxy = $this->backend->get_ip();
 			$country = $get["country"];
 			$nsfw = $get["nsfw"];
 			$time = $get["time"];
 			$size = $get["size"];
 			$ratio = $get["ratio"];
 			$color = $get["color"];
 			$type = $get["type"];
 			$format = $get["format"];
 			$rights = $get["rights"];
 			$page = 0;
 			$params = [
 				"q" => $search,
 				"tbm" => "isch",
 				"asearch" => "isch",
 				"async" => "_fmt:json,p:0,ijn:{$page}", // ijn:0 = page 1
 			];
 			// country (image search uses cr instead of gl)
 			if($country != "any"){
 				$params["cr"] = "country" . strtoupper($country);
 			}
 			// nsfw
 			$params["safe"] = $nsfw == "yes" ? "off" : "active";
 			// generate tbs
 			$tbs = [];
 			// time
 			if($time != "any"){
 				$tbs["qdr"] = $time;
 			}
 			// size
 			if($size != "any"){
 				$params["imgsz"] = $size;
 			}
 			// ratio
 			if($ratio != "any"){
 				$params["imgar"] = $ratio;
 			}
 			// color
 			if($color != "any"){
 				if(
 					$color == "color" ||
 					$color == "trans"
 				){
 					$params["imgc"] = $color;
 				}elseif($color == "bnw"){
 					$params["imgc"] = "gray";
 				}else{
 					$tbs["ic"] = "specific";
 					$tbs["isc"] = $color;
 				}
 			}
 			// type
 			if($type != "any"){
 				$tbs["itp"] = $type;
 			}
 			// format
 			if($format != "any"){
 				$params["as_filetype"] = $format;
 			}
 			// rights (tbs)
 			if($rights != "any"){
 				$tbs["sur"] = $rights;
 			}
 			// append tbs
 			if(count($tbs) !== 0){
 				$params["tbs"] = "";
 				foreach($tbs as $key => $value){
 					$params["tbs"] .= $key . ":" . $value . ",";
 				}
 				$params["tbs"] = rtrim($params["tbs"], ",");
 			}
 		}
 		try{
 			$json = 
 				$this->get(
 					$proxy,
 					"https://www.google.com/search",
 					$params
 				);
 		}catch(Exception $error){
 			throw new Exception("Failed to get search page");
 		}
 		unset($params["async"]);
 		//$json = file_get_contents("scraper/google.json");
 		// detect captcha
 		$this->fuckhtml->load($json);
 		$this->detect_sorry();
 		// remove xssi
 		$json =
 			preg_replace(
 				'/^[^{]*/',
 				"",
 				$json
 			);
 		$json = json_decode($json, true);
 		if($json === null){
 			throw new Exception("Failed to decode JSON");
 		}
 		$out = [
 			"status" => "ok",
 			"npt" => null,
 			"image" => []
 		];
 		if(!isset($json["ischj"]["metadata"])){
 			throw new Exception("Google did not return an image array");
 		}
 		foreach($json["ischj"]["metadata"] as $image){
 			$out["image"][] = [
 				"title" => $this->titledots($image["result"]["page_title"]),
 				"source" => [
 					[
 						"url" => $image["original_image"]["url"],
 						"width" => (int)$image["original_image"]["width"],
 						"height" => (int)$image["original_image"]["height"]
 					],
 					[
 						"url" => $image["thumbnail"]["url"],
 						"width" => (int)$image["thumbnail"]["width"],
 						"height" => (int)$image["thumbnail"]["height"]
 					]
 				],
 				"url" => $image["result"]["referrer_url"]
 			];
 		}
 		$page++;
 		if(count($out["image"]) === 10){
 			$out["npt"] =
 				$this->backend->store(
 					json_encode([
 						"params" => $params,
 						"page" => $page
 					]),
 					"images",
 					$proxy
 				);
 		}
 		return $out;
 	}