diff --git a/lib/frontend.php b/lib/frontend.php index a454bab..04b08b6 100644 --- a/lib/frontend.php +++ b/lib/frontend.php @@ -943,6 +943,7 @@ class frontend{ "google" => "Google", "google_api" => "Google API", "google_cse" => "Google CSE", + "yahoo_japan" => "Yahoo! JAPAN", "startpage" => "Startpage", "qwant" => "Qwant", "ghostery" => "Ghostery", @@ -970,6 +971,7 @@ class frontend{ "brave" => "Brave", "google" => "Google", "google_cse" => "Google CSE", + "yahoo_japan" => "Yahoo! JAPAN", "startpage" => "Startpage", "qwant" => "Qwant", "yep" => "Yep", @@ -1001,6 +1003,7 @@ class frontend{ "brave" => "Brave", "yandex" => "Yandex", "google" => "Google", + "yahoo_japan" => "Yahoo! JAPAN", "startpage" => "Startpage", "qwant" => "Qwant", "baidu" => "Baidu", @@ -1017,6 +1020,7 @@ class frontend{ "ddg" => "DuckDuckGo", "brave" => "Brave", "google" => "Google", + "yahoo_japan" => "Yahoo! JAPAN", "startpage" => "Startpage", "qwant" => "Qwant", "yep" => "Yep", diff --git a/scraper/yahoo_japan.php b/scraper/yahoo_japan.php new file mode 100644 index 0000000..2406fad --- /dev/null +++ b/scraper/yahoo_japan.php @@ -0,0 +1,1162 @@ +backend = new backend("yahoo_japan"); + + include "lib/fuckhtml.php"; + $this->fuckhtml = new fuckhtml(); + } + + public function getfilters($page){ + + return []; + }
+
+	/**
+	 * Send one HTTP GET request with curl and return the response.
+	 *
+	 * @param mixed $proxy Handed unchanged to $this->backend->assign_proxy().
+	 * @param string $url Target URL. When $get is not empty it is encoded
+	 *        with http_build_query() and appended after a "?".
+	 * @param array $get Query parameters, or [] to request $url as given.
+	 * @param bool $return_cookies When true, Set-Cookie response headers are
+	 *        collected and returned together with the body.
+	 * @param bool $is_xhr Selects the request header set: the XHR set asks
+	 *        for JSON and carries the Referer and Cookie headers, the
+	 *        default set mimics a document navigation.
+	 * @param array|null $cookie Cookies as name => value pairs. They are
+	 *        joined as "name=value; ..." and are only sent when $is_xhr
+	 *        is true; the default header set has no Cookie header.
+	 *
+	 * @return string|array The response body, or
+	 *         ["cookies" => array, "body" => string] if $return_cookies.
+	 *
+	 * @throws Exception Carrying the curl error message when curl reports
+	 *         an error for the transfer.
+	 */
+	private function get($proxy, $url, $get = [], $return_cookies = false, $is_xhr = false, $cookie = null){
+
+		$curlproc = curl_init();
+
+		if($get !== []){
+			$get = http_build_query($get);
+			$url .= "?" . $get;
+		}
+
+		curl_setopt($curlproc, CURLOPT_URL, $url);
+
+		// http2 bypass
+		curl_setopt($curlproc, CURLOPT_HTTP_VERSION, CURL_HTTP_VERSION_2_0);
+
+		curl_setopt($curlproc, CURLOPT_ENCODING, ""); // default encoding
+
+		if($cookie !== null){
+
+			// flatten name => value pairs into a single Cookie header value
+			$c = [];
+			foreach($cookie as $name => $value){
+
+				$c[] = "{$name}=$value";
+			}
+
+			$cookie = implode("; ", $c);
+		}
+
+		if($is_xhr){
+
+			curl_setopt($curlproc, CURLOPT_HTTPHEADER,
+				["User-Agent: " . config::USER_AGENT,
+				"Accept: application/json, text/plain, */*",
+				"Accept-Language: en-US,en;q=0.5",
+				"Accept-Encoding: gzip",
+				"Referer: https://search.yahoo.co.jp/",
+				"DNT: 1",
+				"Sec-GPC: 1",
+				"Connection: keep-alive",
+				"Cookie: " . $cookie,
+				"Sec-Fetch-Dest: empty",
+				"Sec-Fetch-Mode: cors",
+				"Sec-Fetch-Site: same-origin",
+				"TE: trailers"]
+			);
+		}else{
+
+			curl_setopt($curlproc, CURLOPT_HTTPHEADER,
+				["User-Agent: " . config::USER_AGENT,
+				"Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
+				"Accept-Language: en-US,en;q=0.5",
+				"Accept-Encoding: gzip",
+				"DNT: 1",
+				"Sec-GPC: 1",
+				"Connection: keep-alive",
+				"Upgrade-Insecure-Requests: 1",
+				"Sec-Fetch-Dest: document",
+				"Sec-Fetch-Mode: navigate",
+				"Sec-Fetch-Site: same-origin",
+				"Sec-Fetch-User: ?1",
+				"Priority: u=0, i",
+				"TE: trailers"]
+			);
+		}
+
+		curl_setopt($curlproc, CURLOPT_RETURNTRANSFER, true);
+		curl_setopt($curlproc, CURLOPT_SSL_VERIFYHOST, 2);
+		curl_setopt($curlproc, CURLOPT_SSL_VERIFYPEER, true);
+		curl_setopt($curlproc, CURLOPT_CONNECTTIMEOUT, 30);
+		curl_setopt($curlproc, CURLOPT_TIMEOUT, 30);
+
+		$this->backend->assign_proxy($curlproc, $proxy);
+
+		if($return_cookies){
+
+			// extract cookies
+			$cookies_tmp = [];
+			curl_setopt($curlproc, CURLOPT_HEADERFUNCTION, function($curlproc, $header) use (&$cookies_tmp){
+
+				// curl expects the number of bytes consumed, so measure
+				// the raw line before it gets split
+				$length = strlen($header);
+
+				$header = explode(":", $header, 2);
+
+				// lines without a ":" (status line, blank separator) only
+				// produce one part and have no value to read
+				if(
+					count($header) === 2 &&
+					trim(strtolower($header[0])) == "set-cookie"
+				){
+
+					$cookie_tmp = explode("=", trim($header[1]), 2);
+
+					// ignore malformed cookies that lack a "name=value" pair
+					if(count($cookie_tmp) === 2){
+
+						// keep the value only, dropping attributes after ";"
+						$cookies_tmp[trim($cookie_tmp[0])] =
+							explode(";", $cookie_tmp[1], 2)[0];
+					}
+				}
+
+				return $length;
+			});
+
+		}
+
+		$data = curl_exec($curlproc);
+
+		if(curl_errno($curlproc)){
+
+			// read the message first, then release the handle before throwing
+			$error = curl_error($curlproc);
+			curl_close($curlproc);
+
+			throw new Exception($error);
+		}
+
+		curl_close($curlproc);
+
+		if($return_cookies){
+
+			return [
+				"cookies" => $cookies_tmp,
+				"body" => $data
+			];
+		}
+
+		return $data;
+	}
+
+ public function web($get){ + + if($get["npt"]){ + + [$url, $proxy] =
$this->backend->get($get["npt"], "web"); + $params = []; + + }else{ + + $search = $get["s"]; + if(strlen($search) === 0){ + + throw new Exception("Search term is empty!"); + } + + $proxy = $this->backend->get_ip(); + + $url = "https://search.yahoo.co.jp/search"; + $params = [ + "p" => $get["s"] + ]; + } + + try{ + $html = $this->get( + $proxy, + $url, + $params + ); + }catch(Exception $error){ + + throw new Exception("Failed to fetch search page"); + } + //$html = file_get_contents("scraper/yahoo_japan.html"); + + $out = [ + "status" => "ok", + "spelling" => [ + "type" => "no_correction", + "using" => null, + "correction" => null + ], + "npt" => null, + "answer" => [], + "web" => [], + "image" => [], + "video" => [], + "news" => [], + "related" => [] + ]; + + $json_object = + explode( + '