google quote on quote fix

This commit is contained in:
2025-10-08 00:42:36 -04:00
parent 4b16fd5897
commit a4a44709b4
6 changed files with 789 additions and 463 deletions

View File

@@ -561,466 +561,7 @@ class google{
/**
 * Scrape Google web search results.
 *
 * Builds (or restores from a next-page token) the query parameters, fetches
 * https://www.google.com/search through $this->get(), then walks every
 * div[data-hveid] of the response and extracts title, description, link,
 * sublinks, date, thumbnail and a small key/value table per result.
 *
 * @param array $get Request options. When $get["npt"] is truthy it is a
 *                   next-page token resolved through $this->backend->get();
 *                   otherwise the keys "s", "country", "nsfw", "lang",
 *                   "older", "newer" and "spellcheck" are read.
 * @return array Result set with the keys "status", "spelling", "npt",
 *               "answer", "web", "image", "video", "news" and "related".
 *               Only "web" and "npt" are populated by this method.
 * @throws Exception "Failed to get HTML" when the HTTP request fails (the
 *                   original exception is attached as the previous one).
 */
public function web($get){
	
	if($get["npt"]){
		
		// resume a previous search: the token stores the JSON-encoded
		// parameters together with the proxy that was used
		[$params, $proxy] = $this->backend->get($get["npt"], "web");
		$params = json_decode($params, true);
	}else{
		
		$search = $get["s"];
		$country = $get["country"];
		$nsfw = $get["nsfw"];
		$lang = $get["lang"];
		$older = $get["older"];
		$newer = $get["newer"];
		$spellcheck = $get["spellcheck"];
		
		$proxy = $this->backend->get_ip();
		
		/*
			Reference request this parameter set was modelled on:
			https://www.google.com/search?udm=14&yv=3&q=asmr&biw=1920&bih=947&start=0&sa=N&asearch=arc&cs=1&async=arc_id:srp_s_110,ffilt:all,ve_name:MoreResultsContainer,use_ac:false,inf:1,_id:arc-srp_s_110,_pms:s,_fmt:pc
		*/
		$params = [
			"udm" => 14,
			"yv" => 3,
			"q" => $search,
			"biw" => 1920,
			"bih" => 947,
			"start" => 0,
			"sa" => "N",
			"asearch" => "arc",
			"cs" => 1,
			"async" => "arc_id:srp_s_110,ffilt:all,ve_name:MoreResultsContainer,use_ac:false,inf:1,_id:arc-srp_s_110,_pms:s,_fmt:pc",
			"hl" => "en",
			"num" => 20
		];
		
		// country
		if($country != "any"){
			
			$params["gl"] = $country;
		}
		
		// nsfw
		$params["safe"] = $nsfw == "yes" ? "off" : "active";
		
		// language
		if($lang != "any"){
			
			$params["lr"] = "lang_" . $lang;
		}
		
		// generate tbs
		$tbs = [];
		
		// date range: false means "not set", anything else is a timestamp
		$older = $older === false ? null : date("m/d/Y", $older);
		$newer = $newer === false ? null : date("m/d/Y", $newer);
		
		if(
			$older !== null ||
			$newer !== null
		){
			
			$tbs["cdr"] = "1";
			$tbs["cd_min"] = $newer;
			$tbs["cd_max"] = $older;
		}
		
		// spellcheck filter
		if($spellcheck == "no"){
			
			$params["nfpr"] = "1";
		}
		
		// serialize tbs as "key:value,key:value"
		if(count($tbs) !== 0){
			
			$params["tbs"] = "";
			
			foreach($tbs as $key => $value){
				
				$params["tbs"] .= $key . ":" . $value . ",";
			}
			
			$params["tbs"] = rtrim($params["tbs"], ",");
		}
	}
	
	// both branches issue the exact same request
	try{
		$html =
			$this->get(
				$proxy,
				"https://www.google.com/search",
				$params
			);
		
	}catch(Exception $error){
		
		// keep the public message, but chain the cause for debugging
		throw new Exception("Failed to get HTML", 0, $error);
	}
	
	$out = [
		"status" => "ok",
		"spelling" => [
			"type" => "no_correction",
			"using" => null,
			"correction" => null
		],
		"npt" => null,
		"answer" => [],
		"web" => [],
		"image" => [],
		"video" => [],
		"news" => [],
		"related" => []
	];
	
	$this->fuckhtml->load($html);
	$this->detect_sorry();
	
	// get individual results
	$results =
		$this->fuckhtml
		->getElementsByAttributeName(
			"data-hveid",
			"div"
		);
	
	foreach($results as $result){
		
		$this->fuckhtml->load($result);
		
		$snfs =
			$this->fuckhtml
			->getElementsByAttributeName(
				"data-snf",
				"div"
			);
		
		$title = null;
		$description = null;
		$link = null;
		$sublinks = [];
		$date = null;
		$thumb = [
			"ratio" => null,
			"url" => null
		];
		$table = [];
		
		// probe for title
		$title_node =
			$this->fuckhtml
			->getElementsByTagName(
				"h3"
			);
		
		if(count($title_node) !== 0){
			
			// found a title node
			$title =
				$this->fuckhtml
				->getTextContent(
					$title_node[0]
				);
		}
		
		if($title === null){
			
			// a data-hveid container without a heading is not a result
			continue;
		}
		
		// every data-snf block carries exactly one kind of information;
		// once a probe matches we move on to the next block
		foreach($snfs as $snf){
			
			$this->fuckhtml->load($snf);
			
			// probe for thumbnail
			$images =
				$this->fuckhtml
				->getElementsByAttributeName(
					"alt",
					"img"
				);
			
			foreach($images as $image){
				
				if(
					isset($image["attributes"]["style"]) &&
					preg_match(
						'/height ?: ?([0-9]+)px/',
						$image["attributes"]["style"],
						$match
					) &&
					(int)$match[1] < 40
				){
					
					// found a favicon, ignore
					continue;
				}
				
				if(!isset($image["attributes"]["src"])){
					
					// nothing usable on this image
					continue;
				}
				
				// use the image that passed the favicon check, not
				// blindly the first one of the list
				$thumb = [
					"ratio" => "1:1",
					"url" =>
						$this->fuckhtml
						->getTextContent(
							$image["attributes"]["src"]
						)
				];
				
				continue 2;
			}
			
			// probe for description
			if($description === null){
				
				// probe 1
				if(
					isset($snf["attributes"]["data-sncf"]) &&
					$snf["attributes"]["data-sncf"] == "1,2"
				){
					
					$description =
						$this->fuckhtml
						->getTextContent(
							$snf
						);
					
					continue;
				}
				
				// probe 2
				$desc_probe =
					$this->fuckhtml
					->getElementsByAttributeValue(
						"style",
						"-webkit-line-clamp:2",
						"div"
					);
				
				if(count($desc_probe) !== 0){
					
					$description =
						$this->fuckhtml
						->getTextContent(
							$desc_probe[0]
						);
					
					continue;
				}
			}
			
			// probe for links
			$links =
				$this->fuckhtml
				->getElementsByAttributeName(
					"data-sb",
					"a"
				);
			
			if(isset($links[0]["attributes"]["data-ved"])){
				
				// found the page link
				$link =
					$this->fuckhtml
					->getTextContent(
						$links[0]["attributes"]["href"]
					);
				
				continue;
			}
			
			if(count($links) !== 0){
				
				// get all sublinks
				foreach($links as $sublink){
					
					$sublinks[] = [
						"title" =>
							$this->titledots(
								$this->fuckhtml
								->getTextContent(
									$sublink
								)
							),
						"description" => null,
						"date" => null,
						"url" =>
							$this->fuckhtml
							->getTextContent(
								$sublink["attributes"]["href"]
							)
					];
				}
				
				continue;
			}
			
			// get tabloid-able data
			$tabloid =
				$this->fuckhtml
				->getElementsByAttributeValue(
					"style",
					"margin-top:0px",
					"div"
				);
			
			if(count($tabloid) === 0){
				
				// try getting <cite> instead
				$tabloid =
					$this->fuckhtml
					->getElementsByTagName(
						"cite"
					);
			}
			
			if(count($tabloid) !== 0){
				
				// found table: fields are separated by a middle dot
				$tabloid =
					explode("·", $tabloid[0]["innerHTML"]);
				
				foreach($tabloid as $tbl){
					
					$preg =
						$this->fuckhtml
						->getTextContent(
							$tbl
						);
					
					if(
						// match price; the u modifier is required so
						// \p{Sc} sees whole code points (€, £, ¥ ...)
						// instead of single bytes
						preg_match(
							'/(\p{Sc}[^\p{Sc}]+)/u',
							$preg,
							$match
						)
					){
						
						$table["Price"] = trim($match[1]);
					}
					
					if(
						// match in stock/delivery
						preg_match(
							'/(stock|delivery|returns)/i',
							$preg,
							$match
						)
					){
						
						$table[ucfirst($match[1])] = trim($preg, " \t\n\r\0\x0B\xC2\xA0");
					}
				}
				
				continue;
			}
		}
		
		// extract date from description
		// NOTE(review): the separator literal was empty in the reviewed
		// source, which makes explode() throw a ValueError on PHP 8. The
		// date prefix is presumably rendered as "<date> — <text>", so we
		// split on an em/en dash here -- confirm against live markup.
		$description_split =
			preg_split(
				'/\s*[—–]\s*/u',
				(string)$description,
				2
			);
		
		if($description_split === false){
			
			// invalid UTF-8 in the snippet: keep it untouched
			$description_split = [(string)$description];
		}
		
		if(count($description_split) === 1){
			
			$description = $description_split[0];
		}elseif(strlen($description_split[0]) < 17){
			
			// only short prefixes are date candidates
			$date = strtotime($description_split[0]);
			
			if($date !== false){
				
				$description = $description_split[1];
			}else{
				
				$date = null;
			}
		}
		
		$out["web"][] = [
			"title" => $this->titledots($title),
			"description" => $this->titledots($description),
			"url" => $link,
			"date" => $date,
			"type" => "web",
			"thumb" => $thumb,
			"sublink" => $sublinks,
			"table" => $table
		];
	}
	
	// get next page: only offer one when this page looked reasonably full
	if(count($out["web"]) > 5){
		
		$params["start"] += 10;
		$out["npt"] =
			$this->backend->store(
				json_encode($params),
				"web",
				$proxy
			);
	}
	
	return $out;
}