From f73b5f0298f06b44c5cd8a84e327b8e1d7d4ea95 Mon Sep 17 00:00:00 2001 From: lolcat Date: Wed, 18 Jun 2025 10:30:31 -0400 Subject: [PATCH] fix yandex web --- scraper/yandex.php | 89 +++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 84 insertions(+), 5 deletions(-) diff --git a/scraper/yandex.php b/scraper/yandex.php index 7af8781..f73c3fd 100644 --- a/scraper/yandex.php +++ b/scraper/yandex.php @@ -14,7 +14,7 @@ class yandex{ // backend included in the scraper functions } - private function get($proxy, $url, $get = [], $nsfw){ + private function get($proxy, $url, $get = [], $nsfw, $get_cookie = 1){ $curlproc = curl_init(); @@ -25,19 +25,55 @@ class yandex{ curl_setopt($curlproc, CURLOPT_URL, $url); + // extract "i" cookie + if($get_cookie === 0){ + + $cookies_tmp = []; + curl_setopt($curlproc, CURLOPT_HEADERFUNCTION, function($curlproc, $header) use (&$cookies_tmp){ + + $length = strlen($header); + + $header = explode(":", $header, 2); + + if(trim(strtolower($header[0])) == "set-cookie"){ + + $cookie_tmp = explode("=", trim($header[1]), 2); + + $cookies_tmp[trim($cookie_tmp[0])] = + explode(";", $cookie_tmp[1], 2)[0]; + } + + return $length; + }); + } + switch($nsfw){ case "yes": $nsfw = "0"; break; case "maybe": $nsfw = "1"; break; case "no": $nsfw = "2"; break; } + switch($get_cookie){ + + case 0: + $cookie = ""; + break; + + case 1: + $cookie = "Cookie: yp=" . (time() - 4000033) . ".szm.1:1920x1080:876x1000#" . time() . ".sp.family:" . $nsfw; + break; + + default: + $cookie = "Cookie: i=" . $get_cookie; + } + $headers = ["User-Agent: " . config::USER_AGENT, "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8", "Accept-Encoding: gzip", "Accept-Language: en-US,en;q=0.5", "DNT: 1", - "Cookie: yp=" . (time() - 4000033) . ".szm.1:1920x1080:876x1000#" . time() . ".sp.family:" . $nsfw, + $cookie, "Referer: https://yandex.com/images/search", "Connection: keep-alive", "Upgrade-Insecure-Requests: 1", @@ -59,6 +95,17 @@ class yandex{ $data = curl_exec($curlproc); + if($get_cookie === 0){ + + if(isset($cookies_tmp["i"])){ + + return $cookies_tmp["i"]; + }else{ + + throw new Exception("Failed to get Yandex clearance cookie"); + } + } + if(curl_errno($curlproc)){ throw new Exception(curl_error($curlproc)); @@ -217,6 +264,23 @@ class yandex{ // https://yandex.com/search/site/?text=minecraft&web=1&frame=1&v=2.0&searchid=3131712 // &within=777&from_day=26&from_month=8&from_year=2023&to_day=26&to_month=8&to_year=2023 + // get clearance cookie + if(($cookie = apcu_fetch("yandexweb_cookie")) === false){ + + $proxy = $this->backend->get_ip(); + + $cookie = + $this->get( + $proxy, + "https://yandex.ru/support2/smart-captcha/ru/", + [], + false, + 0 + ); + + apcu_store("yandexweb_cookie", $cookie); + } + if($get["npt"]){ [$npt, $proxy] = $this->backend->get($get["npt"], "web"); @@ -226,7 +290,8 @@ class yandex{ $proxy, "https://yandex.com" . $npt, [], - "yes" + "yes", + $cookie ); }else{ @@ -236,7 +301,7 @@ class yandex{ throw new Exception("Search term is empty!"); } - $proxy = $this->backend->get_ip(); + $proxy = !isset($proxy) ? $this->backend->get_ip() : $proxy; $lang = $get["lang"]; $older = $get["older"]; $newer = $get["newer"]; @@ -283,7 +348,8 @@ class yandex{ $proxy, "https://yandex.com/search/site/", $params, - "yes" + "yes", + $cookie ); }catch(Exception $error){ @@ -314,6 +380,19 @@ class yandex{ $this->fuckhtml->load($html); + // Scrape page blocked error + $title = + $this->fuckhtml + ->getElementsByTagName("title"); + + if( + count($title) !== 0 && + $title[0]["innerHTML"] == "403" + ){ + + throw new Exception("Yandex blocked this proxy or 4get instance."); + } + // get nextpage $npt = $this->fuckhtml