new google message
This commit is contained in:
@@ -12,6 +12,8 @@ class google{
|
|||||||
|
|
||||||
include "lib/backend.php";
|
include "lib/backend.php";
|
||||||
$this->backend = new backend("google");
|
$this->backend = new backend("google");
|
||||||
|
|
||||||
|
$this->message = "Still working on a Google scraper that uses a headful browser. It will require Firefox + a webExtension running on a dedicated server. Waiting for my EDID adapter and we can get the show going. In the meantime, use the Google CSE/API or Yahoo JP/Startpage scrapers. They're all crippled in their own special ways but they're serviceable I guess.";
|
||||||
}
|
}
|
||||||
|
|
||||||
public function getfilters($page){
|
public function getfilters($page){
|
||||||
@@ -505,7 +507,7 @@ class google{
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private function get($proxy, $url, $get = [], $alt_ua = false){
|
private function get($proxy, $url, $get = []){
|
||||||
|
|
||||||
$curlproc = curl_init();
|
$curlproc = curl_init();
|
||||||
|
|
||||||
@@ -518,35 +520,22 @@ class google{
|
|||||||
|
|
||||||
curl_setopt($curlproc, CURLOPT_ENCODING, ""); // default encoding
|
curl_setopt($curlproc, CURLOPT_ENCODING, ""); // default encoding
|
||||||
|
|
||||||
if($alt_ua === true){
|
curl_setopt($curlproc, CURLOPT_HTTP_VERSION, CURL_HTTP_VERSION_2_0);
|
||||||
|
curl_setopt($curlproc, CURLOPT_HTTPHEADER, [
|
||||||
curl_setopt($curlproc, CURLOPT_HTTPHEADER, [
|
"User-Agent: " . config::USER_AGENT,
|
||||||
"User-Agent: Mozilla/5.0 (iPhone; CPU iPhone OS 26_0_0 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) GSA/406.0.862495628 Mobile/15E148 Safari/604.1",
|
"Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
|
||||||
"Accept: text/html, application/xml;q=0.9, */*;q=0.8",
|
"Accept-Language: en-US,en;q=0.5",
|
||||||
"Accept-Language: en-US,en;q=0.8",
|
"Accept-Encoding: gzip",
|
||||||
"Accept-Encoding: gzip, deflate",
|
"DNT: 1",
|
||||||
"Connection: Keep-Alive",
|
"Connection: keep-alive",
|
||||||
"Cache-Control: no-cache"
|
"Upgrade-Insecure-Requests: 1",
|
||||||
]);
|
"Sec-Fetch-Dest: document",
|
||||||
}else{
|
"Sec-Fetch-Mode: navigate",
|
||||||
|
"Sec-Fetch-Site: none",
|
||||||
curl_setopt($curlproc, CURLOPT_HTTP_VERSION, CURL_HTTP_VERSION_2_0);
|
"Sec-Fetch-User: ?1",
|
||||||
curl_setopt($curlproc, CURLOPT_HTTPHEADER, [
|
"Priority: u=1",
|
||||||
"User-Agent: " . config::USER_AGENT,
|
"TE: trailers"
|
||||||
"Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
|
]);
|
||||||
"Accept-Language: en-US,en;q=0.5",
|
|
||||||
"Accept-Encoding: gzip",
|
|
||||||
"DNT: 1",
|
|
||||||
"Connection: keep-alive",
|
|
||||||
"Upgrade-Insecure-Requests: 1",
|
|
||||||
"Sec-Fetch-Dest: document",
|
|
||||||
"Sec-Fetch-Mode: navigate",
|
|
||||||
"Sec-Fetch-Site: none",
|
|
||||||
"Sec-Fetch-User: ?1",
|
|
||||||
"Priority: u=1",
|
|
||||||
"TE: trailers"
|
|
||||||
]);
|
|
||||||
}
|
|
||||||
|
|
||||||
curl_setopt($curlproc, CURLOPT_RETURNTRANSFER, true);
|
curl_setopt($curlproc, CURLOPT_RETURNTRANSFER, true);
|
||||||
curl_setopt($curlproc, CURLOPT_SSL_VERIFYHOST, 2);
|
curl_setopt($curlproc, CURLOPT_SSL_VERIFYHOST, 2);
|
||||||
@@ -574,228 +563,22 @@ class google{
|
|||||||
|
|
||||||
public function web($get){
|
public function web($get){
|
||||||
|
|
||||||
throw new Exception("There are no known ways to scrape Google's /search endpoint without JS at this time. I'm working on a method that extracts cookies from browsers. Use Google API/CSE/Yahoo JP/Startpage for google results for now.");
|
throw new Exception($this->message);
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
public function video($get){
|
|
||||||
throw new Exception("There are no known ways to scrape Google's /search endpoint without JS at this time. I'm working on a method that extracts cookies from browsers. Use Google API/CSE/Yahoo JP/Startpage for google results for now.");
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
public function news($get){
|
|
||||||
throw new Exception("There are no known ways to scrape Google's /search endpoint without JS at this time. I'm working on a method that extracts cookies from browsers. Use Google API/CSE/Yahoo JP/Startpage for google results for now.");
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
public function image($get){
|
public function image($get){
|
||||||
|
throw new Exception($this->message);
|
||||||
|
}
|
||||||
|
|
||||||
// generate parameters
|
|
||||||
if($get["npt"]){
|
|
||||||
|
|
||||||
[$params, $proxy] =
|
public function video($get){
|
||||||
$this->backend->get(
|
throw new Exception($this->message);
|
||||||
$get["npt"],
|
}
|
||||||
"images"
|
|
||||||
);
|
|
||||||
|
|
||||||
$params = json_decode($params, true);
|
|
||||||
|
|
||||||
$page = $params["page"] + 1;
|
public function news($get){
|
||||||
$params = $params["params"];
|
throw new Exception($this->message);
|
||||||
$params["async"] = "_fmt:json,p:1,ijn:{$page}";
|
|
||||||
|
|
||||||
}else{
|
|
||||||
|
|
||||||
$search = $get["s"];
|
|
||||||
if(strlen($search) === 0){
|
|
||||||
|
|
||||||
throw new Exception("Search term is empty!");
|
|
||||||
}
|
|
||||||
|
|
||||||
$proxy = $this->backend->get_ip();
|
|
||||||
$country = $get["country"];
|
|
||||||
$nsfw = $get["nsfw"];
|
|
||||||
$time = $get["time"];
|
|
||||||
$size = $get["size"];
|
|
||||||
$ratio = $get["ratio"];
|
|
||||||
$color = $get["color"];
|
|
||||||
$type = $get["type"];
|
|
||||||
$format = $get["format"];
|
|
||||||
$rights = $get["rights"];
|
|
||||||
|
|
||||||
$page = 0;
|
|
||||||
|
|
||||||
$params = [
|
|
||||||
"q" => $search,
|
|
||||||
"tbm" => "isch",
|
|
||||||
"asearch" => "isch",
|
|
||||||
"async" => "_fmt:json,p:0,ijn:{$page}", // ijn:0 = page 1
|
|
||||||
];
|
|
||||||
|
|
||||||
// country (image search uses cr instead of gl)
|
|
||||||
if($country != "any"){
|
|
||||||
|
|
||||||
$params["cr"] = "country" . strtoupper($country);
|
|
||||||
}
|
|
||||||
|
|
||||||
// nsfw
|
|
||||||
$params["safe"] = $nsfw == "yes" ? "off" : "active";
|
|
||||||
|
|
||||||
// generate tbs
|
|
||||||
$tbs = [];
|
|
||||||
|
|
||||||
// time
|
|
||||||
if($time != "any"){
|
|
||||||
|
|
||||||
$tbs["qdr"] = $time;
|
|
||||||
}
|
|
||||||
|
|
||||||
// size
|
|
||||||
if($size != "any"){
|
|
||||||
|
|
||||||
$params["imgsz"] = $size;
|
|
||||||
}
|
|
||||||
|
|
||||||
// ratio
|
|
||||||
if($ratio != "any"){
|
|
||||||
|
|
||||||
$params["imgar"] = $ratio;
|
|
||||||
}
|
|
||||||
|
|
||||||
// color
|
|
||||||
if($color != "any"){
|
|
||||||
|
|
||||||
if(
|
|
||||||
$color == "color" ||
|
|
||||||
$color == "trans"
|
|
||||||
){
|
|
||||||
|
|
||||||
$params["imgc"] = $color;
|
|
||||||
}elseif($color == "bnw"){
|
|
||||||
|
|
||||||
$params["imgc"] = "gray";
|
|
||||||
}else{
|
|
||||||
|
|
||||||
$tbs["ic"] = "specific";
|
|
||||||
$tbs["isc"] = $color;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// type
|
|
||||||
if($type != "any"){
|
|
||||||
|
|
||||||
$tbs["itp"] = $type;
|
|
||||||
}
|
|
||||||
|
|
||||||
// format
|
|
||||||
if($format != "any"){
|
|
||||||
|
|
||||||
$params["as_filetype"] = $format;
|
|
||||||
}
|
|
||||||
|
|
||||||
// rights (tbs)
|
|
||||||
if($rights != "any"){
|
|
||||||
|
|
||||||
$tbs["sur"] = $rights;
|
|
||||||
}
|
|
||||||
|
|
||||||
// append tbs
|
|
||||||
if(count($tbs) !== 0){
|
|
||||||
|
|
||||||
$params["tbs"] = "";
|
|
||||||
|
|
||||||
foreach($tbs as $key => $value){
|
|
||||||
|
|
||||||
$params["tbs"] .= $key . ":" . $value . ",";
|
|
||||||
}
|
|
||||||
|
|
||||||
$params["tbs"] = rtrim($params["tbs"], ",");
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
try{
|
|
||||||
$json =
|
|
||||||
$this->get(
|
|
||||||
$proxy,
|
|
||||||
"https://www.google.com/search",
|
|
||||||
$params
|
|
||||||
);
|
|
||||||
}catch(Exception $error){
|
|
||||||
|
|
||||||
throw new Exception("Failed to get search page");
|
|
||||||
}
|
|
||||||
|
|
||||||
unset($params["async"]);
|
|
||||||
|
|
||||||
//$json = file_get_contents("scraper/google.json");
|
|
||||||
|
|
||||||
// detect captcha
|
|
||||||
$this->fuckhtml->load($json);
|
|
||||||
$this->detect_sorry();
|
|
||||||
|
|
||||||
// remove xssi
|
|
||||||
$json =
|
|
||||||
preg_replace(
|
|
||||||
'/^[^{]*/',
|
|
||||||
"",
|
|
||||||
$json
|
|
||||||
);
|
|
||||||
|
|
||||||
$json = json_decode($json, true);
|
|
||||||
|
|
||||||
if($json === null){
|
|
||||||
|
|
||||||
throw new Exception("Failed to decode JSON");
|
|
||||||
}
|
|
||||||
|
|
||||||
$out = [
|
|
||||||
"status" => "ok",
|
|
||||||
"npt" => null,
|
|
||||||
"image" => []
|
|
||||||
];
|
|
||||||
|
|
||||||
if(!isset($json["ischj"]["metadata"])){
|
|
||||||
|
|
||||||
throw new Exception("Google did not return an image array");
|
|
||||||
}
|
|
||||||
|
|
||||||
foreach($json["ischj"]["metadata"] as $image){
|
|
||||||
|
|
||||||
$out["image"][] = [
|
|
||||||
"title" => $this->titledots($image["result"]["page_title"]),
|
|
||||||
"source" => [
|
|
||||||
[
|
|
||||||
"url" => $image["original_image"]["url"],
|
|
||||||
"width" => (int)$image["original_image"]["width"],
|
|
||||||
"height" => (int)$image["original_image"]["height"]
|
|
||||||
],
|
|
||||||
[
|
|
||||||
"url" => $image["thumbnail"]["url"],
|
|
||||||
"width" => (int)$image["thumbnail"]["width"],
|
|
||||||
"height" => (int)$image["thumbnail"]["height"]
|
|
||||||
]
|
|
||||||
],
|
|
||||||
"url" => $image["result"]["referrer_url"]
|
|
||||||
];
|
|
||||||
}
|
|
||||||
|
|
||||||
$page++;
|
|
||||||
|
|
||||||
if(count($out["image"]) === 10){
|
|
||||||
|
|
||||||
$out["npt"] =
|
|
||||||
$this->backend->store(
|
|
||||||
json_encode([
|
|
||||||
"params" => $params,
|
|
||||||
"page" => $page
|
|
||||||
]),
|
|
||||||
"images",
|
|
||||||
$proxy
|
|
||||||
);
|
|
||||||
}
|
|
||||||
|
|
||||||
return $out;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user