fix yandex web

This commit is contained in:
lolcat 2025-06-18 10:30:31 -04:00
parent 3e1487e614
commit f73b5f0298
1 changed file with 84 additions and 5 deletions

View File

@ -14,7 +14,7 @@ class yandex{
// backend included in the scraper functions // backend included in the scraper functions
} }
private function get($proxy, $url, $get = [], $nsfw){ private function get($proxy, $url, $get = [], $nsfw, $get_cookie = 1){
$curlproc = curl_init(); $curlproc = curl_init();
@ -25,19 +25,55 @@ class yandex{
curl_setopt($curlproc, CURLOPT_URL, $url); curl_setopt($curlproc, CURLOPT_URL, $url);
// extract "i" cookie
if($get_cookie === 0){
$cookies_tmp = [];
curl_setopt($curlproc, CURLOPT_HEADERFUNCTION, function($curlproc, $header) use (&$cookies_tmp){
$length = strlen($header);
$header = explode(":", $header, 2);
if(trim(strtolower($header[0])) == "set-cookie"){
$cookie_tmp = explode("=", trim($header[1]), 2);
$cookies_tmp[trim($cookie_tmp[0])] =
explode(";", $cookie_tmp[1], 2)[0];
}
return $length;
});
}
switch($nsfw){ switch($nsfw){
case "yes": $nsfw = "0"; break; case "yes": $nsfw = "0"; break;
case "maybe": $nsfw = "1"; break; case "maybe": $nsfw = "1"; break;
case "no": $nsfw = "2"; break; case "no": $nsfw = "2"; break;
} }
switch($get_cookie){
case 0:
$cookie = "";
break;
case 1:
$cookie = "Cookie: yp=" . (time() - 4000033) . ".szm.1:1920x1080:876x1000#" . time() . ".sp.family:" . $nsfw;
break;
default:
$cookie = "Cookie: i=" . $get_cookie;
}
$headers = $headers =
["User-Agent: " . config::USER_AGENT, ["User-Agent: " . config::USER_AGENT,
"Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8", "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
"Accept-Encoding: gzip", "Accept-Encoding: gzip",
"Accept-Language: en-US,en;q=0.5", "Accept-Language: en-US,en;q=0.5",
"DNT: 1", "DNT: 1",
"Cookie: yp=" . (time() - 4000033) . ".szm.1:1920x1080:876x1000#" . time() . ".sp.family:" . $nsfw, $cookie,
"Referer: https://yandex.com/images/search", "Referer: https://yandex.com/images/search",
"Connection: keep-alive", "Connection: keep-alive",
"Upgrade-Insecure-Requests: 1", "Upgrade-Insecure-Requests: 1",
@ -59,6 +95,17 @@ class yandex{
$data = curl_exec($curlproc); $data = curl_exec($curlproc);
if($get_cookie === 0){
if(isset($cookies_tmp["i"])){
return $cookies_tmp["i"];
}else{
throw new Exception("Failed to get Yandex clearance cookie");
}
}
if(curl_errno($curlproc)){ if(curl_errno($curlproc)){
throw new Exception(curl_error($curlproc)); throw new Exception(curl_error($curlproc));
@ -217,6 +264,23 @@ class yandex{
// https://yandex.com/search/site/?text=minecraft&web=1&frame=1&v=2.0&searchid=3131712 // https://yandex.com/search/site/?text=minecraft&web=1&frame=1&v=2.0&searchid=3131712
// &within=777&from_day=26&from_month=8&from_year=2023&to_day=26&to_month=8&to_year=2023 // &within=777&from_day=26&from_month=8&from_year=2023&to_day=26&to_month=8&to_year=2023
// get clearance cookie
if(($cookie = apcu_fetch("yandexweb_cookie")) === false){
$proxy = $this->backend->get_ip();
$cookie =
$this->get(
$proxy,
"https://yandex.ru/support2/smart-captcha/ru/",
[],
false,
0
);
apcu_store("yandexweb_cookie", $cookie);
}
if($get["npt"]){ if($get["npt"]){
[$npt, $proxy] = $this->backend->get($get["npt"], "web"); [$npt, $proxy] = $this->backend->get($get["npt"], "web");
@ -226,7 +290,8 @@ class yandex{
$proxy, $proxy,
"https://yandex.com" . $npt, "https://yandex.com" . $npt,
[], [],
"yes" "yes",
$cookie
); );
}else{ }else{
@ -236,7 +301,7 @@ class yandex{
throw new Exception("Search term is empty!"); throw new Exception("Search term is empty!");
} }
$proxy = $this->backend->get_ip(); $proxy = !isset($proxy) ? $this->backend->get_ip() : $proxy;
$lang = $get["lang"]; $lang = $get["lang"];
$older = $get["older"]; $older = $get["older"];
$newer = $get["newer"]; $newer = $get["newer"];
@ -283,7 +348,8 @@ class yandex{
$proxy, $proxy,
"https://yandex.com/search/site/", "https://yandex.com/search/site/",
$params, $params,
"yes" "yes",
$cookie
); );
}catch(Exception $error){ }catch(Exception $error){
@ -314,6 +380,19 @@ class yandex{
$this->fuckhtml->load($html); $this->fuckhtml->load($html);
// Detect the "403" page Yandex serves when the request is blocked
$title =
$this->fuckhtml
->getElementsByTagName("title");
if(
count($title) !== 0 &&
$title[0]["innerHTML"] == "403"
){
throw new Exception("Yandex blocked this proxy or 4get instance.");
}
// get nextpage // get nextpage
$npt = $npt =
$this->fuckhtml $this->fuckhtml