From f43feff0aa5d56ee3f75618cf4ab50fa325e263e Mon Sep 17 00:00:00 2001 From: lolcat Date: Sun, 27 Jul 2025 21:46:03 -0400 Subject: [PATCH] added baidu, the best search engine --- lib/frontend.php | 6 +- lib/fuckhtml.php | 16 +- scraper/baidu.php | 2229 +++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 2248 insertions(+), 3 deletions(-) create mode 100644 scraper/baidu.php diff --git a/lib/frontend.php b/lib/frontend.php index f49cfdd..954210d 100644 --- a/lib/frontend.php +++ b/lib/frontend.php @@ -949,6 +949,7 @@ class frontend{ "crowdview" => "Crowdview", "mwmbl" => "Mwmbl", "mojeek" => "Mojeek", + "baidu" => "Baidu", "solofield" => "Solofield", "marginalia" => "Marginalia", "wiby" => "wiby", @@ -969,6 +970,7 @@ class frontend{ "startpage" => "Startpage", "qwant" => "Qwant", "yep" => "Yep", + "baidu" => "Baidu", "solofield" => "Solofield", "pinterest" => "Pinterest", "flickr" => "Flickr", @@ -993,6 +995,7 @@ class frontend{ "google" => "Google", "startpage" => "Startpage", "qwant" => "Qwant", + "baidu" => "Baidu", "solofield" => "Solofield" ] ]; @@ -1008,7 +1011,8 @@ class frontend{ "startpage" => "Startpage", "qwant" => "Qwant", "yep" => "Yep", - "mojeek" => "Mojeek" + "mojeek" => "Mojeek", + "baidu" => "Baidu" ] ]; break; diff --git a/lib/fuckhtml.php b/lib/fuckhtml.php index 5b45578..3ea256f 100644 --- a/lib/fuckhtml.php +++ b/lib/fuckhtml.php @@ -240,12 +240,13 @@ class fuckhtml{ public function getElementsByFuzzyAttributeValue(string $name, string $value, $collection = null){ $elems = $this->getElementsByAttributeName($name, $collection); + $value = explode( " ", trim( preg_replace( - '/ +/', + '/\s+/', " ", $value ) @@ -258,7 +259,18 @@ class fuckhtml{ foreach($elem["attributes"] as $attrib_name => $attrib_value){ - $attrib_value = explode(" ", $attrib_value); + $attrib_value = + explode( + " ", + trim( + preg_replace( + '/\s+/', + " ", + $attrib_value + ) + ) + ); + $ac = count($attrib_value); $nc = count($value); $cr = 0; diff --git a/scraper/baidu.php b/scraper/baidu.php new file mode 100644 index 0000000..efb14ca --- /dev/null +++ b/scraper/baidu.php @@ -0,0 +1,2229 @@ +backend = new backend("baidu"); + + include "lib/fuckhtml.php"; + $this->fuckhtml = new fuckhtml(); + + $this->handles = []; + $this->proc = null; + $this->handle_category = null; + $this->handle_increment = 0; + $this->sublink_increment = 0; + + $this->cookie = null; + } + + public function getfilters($page){ + + switch($page){ + + case "web": + return + [ + "newer" => [ + "display" => "Newer than", + "option" => "_DATE" + ], + "older" => [ + "display" => "Older than", + "option" => "_DATE" + ] + ]; + break; + + case "images": + return + [ + "sort" => [ + "display" => "Sort", + "option" => [ + "relevance" => "Relevance", // no param + "latest" => "Latest", // &latest=1 + "hot" => "Hot" // &hot=1 + ] + ], + "size" => [ + "display" => "Size", + "option" => [ + "any" => "Any size", + "7" => "Extra large (1080px+)", // &z=7 + "6" => "Large (600px~1080px)", // &z=6 + "5" => "Medium (300px~600px)", // &z=5 + "4" => "Small (1px~300px)" // &z=4 + ] + ], + "ratio" => [ + "display" => "Ratio", + "option" => [ + "any" => "Any ratio", + "1" => "Tall vertical", // &imgratio=1 + "2" => "Vertical", // &imgratio=2 + "3" => "Square", // &imgratio=3 + "4" => "Horizontal", // &imgratio=4 + "5" => "Wide horizontal" // &imgratio=5 + ] + ], + "format" => [ + "display" => "Format", + "option" => [ + "any" => "Any format", + "3" => "JPG", // &imgformat=3 + "5" => "JPEG", // &imgformat=5 + "4" => "PNG", // &imgformat=4 + "2" => "BMP", // &imgformat=2 + "6" => "GIF (Animated)" // &imgformat=6 + ] + ], + "color" => [ + "display" => "Color", + "option" => [ + "any" => "Any color", + "1024" => "White", // &ic=1024 + "2048" => "Black & White", + "512" => "Black", + "64" => "Magenta", + "16" => "Blue", + "1" => "Red", + "2" => "Yellow", + "32" => "Purple", + "4" => "Green", + "8" => "Teal", + "256" => "Orange", + "128" => "Brown" + ] + ], + "type" => [ + "display" => "Type", + "option" => [ + "any" => "Any type", + "hd" => "HD", // &hd=1 + "isImgSet" => "Photo album", // &isImgSet=1 + "copyright" => "Copyright" // ©right=1 + ] + ] + ]; + break; + + case "videos": + return []; + break; + + case "news": + return [ + "category" => [ + "display" => "Category", + "option" => [ + "any" => "All news", + "media" => "Media websites", // &medium=1 + "baijiahao" => "Baidu Baijiahao" // &medium=2 + ] + ] + ]; + break; + } + } + + private function get($proxy, $url, $get = [], $referer = false){ + + $curlproc = curl_init(); + + if($get !== []){ + $get = http_build_query($get); + $url .= "?" . $get; + } + + $cookies_tmp = []; + curl_setopt($curlproc, CURLOPT_HEADERFUNCTION, function($curlproc, $header) use (&$cookies_tmp){ + + $length = strlen($header); + + $header = explode(":", $header, 2); + + if(trim(strtolower($header[0])) == "set-cookie"){ + + $cookie_tmp = explode("=", trim($header[1]), 2); + + $cookies_tmp[trim($cookie_tmp[0])] = + explode(";", $cookie_tmp[1], 2)[0]; + } + + return $length; + }); + + curl_setopt($curlproc, CURLOPT_URL, $url); + + curl_setopt($curlproc, CURLOPT_ENCODING, ""); // default encoding + + if($referer === false){ + if($this->cookie === null){ + + curl_setopt($curlproc, CURLOPT_HTTPHEADER, + ["User-Agent: " . config::USER_AGENT, + "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", + "Accept-Language: en-US,en;q=0.5", + "Accept-Encoding: gzip, deflate, br, zstd", + "DNT: 1", + "Sec-GPC: 1", + "Connection: keep-alive", + "Upgrade-Insecure-Requests: 1", + "Sec-Fetch-Dest: document", + "Sec-Fetch-Mode: navigate", + "Sec-Fetch-Site: cross-site", + "Priority: u=0, i"] + ); + }else{ + + curl_setopt($curlproc, CURLOPT_HTTPHEADER, + ["User-Agent: " . config::USER_AGENT, + "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", + "Accept-Language: en-US,en;q=0.5", + "Accept-Encoding: gzip, deflate, br, zstd", + "DNT: 1", + "Sec-GPC: 1", + "Connection: keep-alive", + "Cookie: {$this->cookie}", + "Upgrade-Insecure-Requests: 1", + "Sec-Fetch-Dest: document", + "Sec-Fetch-Mode: navigate", + "Sec-Fetch-Site: cross-site", + "Priority: u=0, i"] + ); + } + }else{ + + if($this->cookie === null){ + + curl_setopt($curlproc, CURLOPT_HTTPHEADER, + ["User-Agent: " . config::USER_AGENT, + "Accept: application/json, text/plain, */*", + "Accept-Language: en-US,en;q=0.5", + "Accept-Encoding: gzip, deflate, br, zstd", + "Referer: {$referer}", + "DNT: 1", + "Sec-GPC: 1", + "Connection: keep-alive", + "Upgrade-Insecure-Requests: 1", + "Sec-Fetch-Dest: empty", + "Sec-Fetch-Mode: cors", + "Sec-Fetch-Site: same-origin"] + ); + }else{ + + curl_setopt($curlproc, CURLOPT_HTTPHEADER, + ["User-Agent: " . config::USER_AGENT, + "Accept: application/json, text/plain, */*", + "Accept-Language: en-US,en;q=0.5", + "Accept-Encoding: gzip, deflate, br, zstd", + "Referer: {$referer}", + "DNT: 1", + "Sec-GPC: 1", + "Connection: keep-alive", + "Cookie: {$this->cookie}", + "Upgrade-Insecure-Requests: 1", + "Sec-Fetch-Dest: empty", + "Sec-Fetch-Mode: cors", + "Sec-Fetch-Site: same-origin"] + ); + } + } + + curl_setopt($curlproc, CURLOPT_RETURNTRANSFER, true); + curl_setopt($curlproc, CURLOPT_SSL_VERIFYHOST, 2); + curl_setopt($curlproc, CURLOPT_SSL_VERIFYPEER, true); + curl_setopt($curlproc, CURLOPT_CONNECTTIMEOUT, 30); + curl_setopt($curlproc, CURLOPT_TIMEOUT, 30); + + $this->backend->assign_proxy($curlproc, $proxy); + + $data = curl_exec($curlproc); + + if(curl_errno($curlproc)){ + + throw new Exception(curl_error($curlproc)); + } + + // store cookie + if(strlen($this->cookie) !== 0){ + + $this->cookie .= "; "; + } + + foreach($cookies_tmp as $cookie_name => $cookie_value){ + + $this->cookie .= $cookie_name . "=" . $cookie_value . "; "; + } + + $this->cookie = rtrim($this->cookie, " ;"); + + curl_close($curlproc); + return $data; + } + + private function redirect_add_url($proxy, $url){ + + if( + preg_match( + '/^https?:\/\/(?:www\.)?baidu\.com\/link\?/', + $url + ) === 0 + ){ + + // not a baidu redirect + return; + } + + $curlproc = curl_init(); + + curl_setopt($curlproc, CURLOPT_URL, $url); + + curl_setopt($curlproc, CURLOPT_ENCODING, ""); // default encoding + curl_setopt($curlproc, CURLOPT_HTTPHEADER, + ["User-Agent: " . config::USER_AGENT, + "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", + "Accept-Language: en-US,en;q=0.5", + "Accept-Encoding: gzip, deflate, br, zstd", + "DNT: 1", + "Sec-GPC: 1", + "Connection: keep-alive", + "Upgrade-Insecure-Requests: 1", + "Sec-Fetch-Dest: document", + "Sec-Fetch-Mode: navigate", + "Sec-Fetch-Site: none", + "Sec-Fetch-User: ?1", + "Priority: u=0, i"] + ); + + curl_setopt($curlproc, CURLOPT_RETURNTRANSFER, true); + curl_setopt($curlproc, CURLOPT_SSL_VERIFYHOST, 2); + curl_setopt($curlproc, CURLOPT_SSL_VERIFYPEER, true); + curl_setopt($curlproc, CURLOPT_CONNECTTIMEOUT, 30); + curl_setopt($curlproc, CURLOPT_TIMEOUT, 30); + + curl_setopt($curlproc, CURLOPT_HEADER, true); + curl_setopt($curlproc, CURLOPT_NOBODY, true); + + $this->backend->assign_proxy($curlproc, $proxy); + + curl_multi_add_handle($this->proc, $curlproc); + $this->handles[$this->handle_category][$this->handle_increment][$this->sublink_increment] = $curlproc; + } + + private function resolve_urls($proxy, &$collection, $categories){ + + $this->proc = curl_multi_init(); + curl_multi_select($this->proc); + + foreach($categories as $category){ + + $this->sublink_increment = 0; + $this->handle_increment = 0; + $this->handle_category = $category; + + foreach($collection[$category] as $item){ + + $this->sublink_increment = 0; + $this->redirect_add_url($proxy, $item["url"]); + + if(isset($item["sublink"])){ + + foreach($item["sublink"] as $sublink){ + + $this->sublink_increment++; + $this->redirect_add_url($proxy, $sublink["url"]); + } + } + + $this->handle_increment++; + } + } + + do{ + $status = curl_multi_exec($this->proc, $active); + + }while($active && $status == CURLM_OK); + + // + // if we reach this, we're done downloading garbage + // + + foreach($this->handles as $category => $v){ + + foreach($v as $index => $data){ + + foreach($this->handles[$category][$index] as $sublinkindex => $handle){ + + preg_match( + '/location: ?(.*)$/im', + curl_multi_getcontent($handle), + $location + ); + + if(isset($location[1])){ + + if($sublinkindex === 0){ + + $collection[$category][$index]["url"] = trim($location[1]); + }else{ + + $collection[$category][$index]["sublink"][$sublinkindex - 1]["url"] = trim($location[1]); + } + } + + curl_multi_remove_handle($this->proc, $handle); + curl_close($handle); + } + } + } + + curl_multi_close($this->proc); + } + + private function resolve_images($proxy, &$data){ + + // get the image viewer that contains all of the images direct URLs + // for some reason, getting the second image's url in the set + // doesnt trigger the captcha + + if( + !isset($data["image"][1]["url"]) || + preg_match( + '/^https:\/\/image\.baidu\.com\/search\/detail/', + $data["image"][1]["url"] + ) === 0 + ){ + + // we have an already resolved image link, do nothing + return; + } + + try{ + + $html = + $this->get( + $proxy, + $data["image"][1]["url"], + [] + ); + }catch(Exception $error){ + + // fallback to the limited dataset we have + return; + } + + $this->fuckhtml->load($html); + + $script = + $this->fuckhtml + ->getElementById( + "image-detail-data", + "script" + ); + + if($script){ + + $json = + json_decode( + $script["innerHTML"], + true + ); + + if( + !isset($json["data"]["images"]) || + count($json["data"]["images"]) === 0 + ){ + + // do nothing + return; + } + + // + // Discard all previously scraped images and use data + // from the newly downloaded image carousel + // the imageset !!should!! be the same + // + $data["image"] = []; + + foreach($json["data"]["images"] as $image){ + + parse_str(parse_url($image["thumburl"], PHP_URL_QUERY), $thumb_size); + + $data["image"][] = [ + "title" => + $this->fuckhtml + ->getTextContent( + $image["titleShow"] + ), + "source" => [ + [ + "url" => $image["objurl"], + "width" => (int)$image["width"], + "height" => (int)$image["height"] + ], + [ // thumbnail + "url" => $image["thumburl"], + "width" => (int)$thumb_size["w"], + "height" => (int)$thumb_size["h"] + ] + ], + "url" => $image["fromUrl"] + ]; + } + } + } + + public function web($get){ + + if($get["npt"]){ + + [$json, $proxy] = $this->backend->get($get["npt"], "web"); + + $json = json_decode($json, true); + $this->cookie = $json["cookie"]; + $npt_data = $json["req"]; + + $npt_data["pn"] = $npt_data["pn"] + 20; + + try{ + + $html = $this->get( + $proxy, + "https://www.baidu.com/s", + $npt_data + ); + }catch(Exception $error){ + + throw new Exception("Failed to fetch search page"); + } + + }else{ + + // + // Get authentication token + // + $proxy = $this->backend->get_ip(); + + // running this will give us shit in $this->cookie + // @TODO probably not needed? I get blocked anyways ffs + //$this->get($proxy, "https://www.baidu.com", []); + + $npt_data = [ + "wd" => $get["s"], + "rn" => 20 + ]; + + // &gpc=stf%3D0%2C1752638400|stftype%3D2 + if( + $get["older"] !== false || + $get["newer"] !== false + ){ + + if($get["older"] === false){ + + $get["older"] = 0; + } + + $npt_data["gpc"] = "stf={$get["older"]},{$get["newer"]}|stftype=2"; + } + + try{ + + $html = $this->get( + $proxy, + "https://www.baidu.com/s", + $npt_data + ); + }catch(Exception $error){ + + throw new Exception("Failed to fetch search page"); + } + + $npt_data["pn"] = 0; + } + + return $this->parse_search($proxy, "web", $npt_data, $html); + } + + private function parse_search($proxy, $pagetype, $npt_data, $html){ + + // @HACK + // remove newlines from the html, cause it fucks with fuckhtml + $html = str_replace(["\n", "\r"], "", $html); + + $out = [ + "status" => "ok", + "spelling" => [ + "type" => "no_correction", + "using" => null, + "correction" => null + ], + "npt" => null, + "answer" => [], + "web" => [], + "image" => [], + "video" => [], + "news" => [], + "related" => [] + ]; + + $this->fuckhtml->load($html); + + $this->detect_ass(); + + $datafields = + $this->fuckhtml + ->getElementsByAttributeName( + "id", + "div" + ); + + // + // Get next page + // + $npt = + $this->fuckhtml + ->getElementsByClassName( + "n", + "a" + ); + + if(count($npt) !== 0){ + + $out["npt"] = + $this->backend->store( + json_encode([ + "req" => $npt_data, + "cookie" => $this->cookie + ]), + $pagetype, + $proxy + ); + } + + // + // Get related searches + // + $related_container = + $this->fuckhtml + ->getElementById( + "rs_new", + $datafields + ); + + if($related_container){ + + $this->fuckhtml->load($related_container); + + $as = + $this->fuckhtml + ->getElementsByClassName( + "c-color-link", + "a" + ); + + foreach($as as $a){ + + $text = + explode( + ">", + $this->fuckhtml + ->getTextContent( + $a + ), + 2 + ); + + $out["related"][] = $text[count($text) - 1]; + } + } + + foreach($datafields as $datafield){ + + if( + !isset($datafield["attributes"]["id"]) || + preg_match( + '/^[0-9]+$/', + $datafield["attributes"]["id"] + ) === 0 + ){ + + // not a search result + continue; + } + + $this->fuckhtml->load($datafield); + $div = + $this->fuckhtml + ->getElementsByTagName( + "div" + ); + + // + // Don't parse as a search result if it's a card + // + $card = + $this->fuckhtml + ->getElementsByClassName( + "cosc-card", + $div + ); + + if(count($card) !== 0){ + + // + // Parse chinese youtube shorts + // + $ytshorts_probe = + $this->fuckhtml + ->getElementsByClassName( + "tts-b-item", + $div + ); + + if(count($ytshorts_probe) !== 0){ + + $videos = + $this->fuckhtml + ->getElementsByAttributeValue( + "data-show", + "list", + $div + ); + + foreach($videos as $video){ + + $this->fuckhtml->load($video); + + $title = + $this->fuckhtml + ->getElementsByClassName( + "cosc-title-slot", + "span" + ); + + if(count($title) === 0){ + + continue; + } + + $url = + $this->fuckhtml + ->getElementsByTagName( + "a" + ); + + if(count($url) === 0){ + + continue; + } + + $image = + $this->fuckhtml + ->getElementsByClassName( + "cos-image-body", + "img" + ); + + if(count($image) === 0){ + + $image = [ + "ratio" => null, + "url" => null + ]; + }else{ + + $image = [ + "ratio" => "1:1", + "url" => + $this->fuckhtml + ->getTextContent( + $image[0]["attributes"]["src"] + ) + ]; + } + + // get duration + $divs = + $this->fuckhtml + ->getElementsByAttributeName( + "class", + "div" + ); + + $duration = null; + foreach($divs as $probe){ + + if(strpos($probe["attributes"]["class"], "tag-bottom-right") !== false){ + + $duration = + $this->hms2int( + $this->fuckhtml + ->getTextContent( + $probe + ) + ); + break; + } + } + + $out["video"][] = [ + "title" => + $this->fuckhtml + ->getTextContent( + $title[0] + ), + "description" => null, + "date" => null, + "duration" => $duration, + "views" => null, + "thumb" => $image, + "url" => + $this->fuckhtml + ->getTextContent( + $url[0]["attributes"]["href"] + ) + ]; + } + } + + // + // Parse image carousel + // + $is_image_carousel = false; + foreach($div as $d){ + + if( + isset($d["attributes"]["class"]) && + strpos($d["attributes"]["class"], "image-container") !== false + ){ + + $is_image_carousel = true; + break; + } + } + + if($is_image_carousel){ + + preg_match( + '//U', + $datafield["innerHTML"], + $matches + ); + + if(isset($matches[1])){ + + // weird behavior with the smaller image carousel where --cos* CSS variables are escaped wrong + $json = + $this->fuckhtml + ->parseJsObject( + str_replace( + "-\-", + "--", + $matches[1] + ) + ); + + if( + $json !== null && + isset($json["imageList"][0]["images"]) + ){ + + // parse image carousel + foreach($json["imageList"][0]["images"] as $image){ + + parse_str(parse_url($image["thumburl"], PHP_URL_QUERY), $thumb_size); + + $out["image"][] = [ + "title" => "image", + "source" => [ + [ + "url" => $image["objurl"], + "width" => (int)$image["width"], + "height" => (int)$image["height"] + ], + [ // thumbnail + "url" => $image["thumburl"], + "width" => (int)$thumb_size["w"], + "height" => (int)$thumb_size["h"] + ] + ], + "url" => $image["jumpUrl"] + ]; + } + } + } + } + continue; + } + + if(!isset($datafield["attributes"]["mu"])){ + + // dont scrape if we dont have the direct link + continue; + } + + // class:FYB_RD -> News garbage, IGNORE + + $result = + $this->fuckhtml + ->getElementsByClassName( + "result", + [$datafield] + ); + + if(count($result) !== 0){ + + // + // Parse normal search result + // + + $title = + $this->fuckhtml + ->getElementsByClassName( + "sc-link", + "a" + ); + + if(count($title) === 0){ + + // should not happen + continue; + } + + $title = + $this->titledots( + $this->fuckhtml + ->getTextContent( + $title[0] + ) + ); + + $description = + $this->fuckhtml + ->getElementsByClassName( + "c-color", + $div + ); + + if(count($description) !== 0){ + + $this->fuckhtml->load($description[0]); + + $description = + $this->fuckhtml + ->getElementsByAttributeName( + "class", + "span" + ); + + $found_desc = false; + foreach($description as $desc){ + + if(stripos($desc["attributes"]["class"], "summary-text") !== false){ + + $found_desc = true; + $description = + $this->titledots( + $this->fuckhtml + ->getTextContent( + $desc + ) + ); + break; + } + } + + if($found_desc === false){ + + $description = null; + } + + $this->fuckhtml->load($datafield); + }else{ + + $description = null; + } + + // parse date + $date_probe = + $this->fuckhtml + ->getElementsByClassName( + "cos-color-text-minor", + "span" + ); + + if(count($date_probe) !== 0){ + + $date = + $this->parse_time( + $this->fuckhtml + ->getTextContent( + $date_probe[0] + ) + ); + }else{ + + $date = null; + } + + // parse image + $img = + $this->fuckhtml + ->getElementsByTagName( + "img" + ); + + if(count($img) !== 0){ + + $image = [ + "ratio" => "16:9", + "url" => + $this->unfuckthumb( + $this->fuckhtml + ->getTextContent( + $img[0]["attributes"]["src"] + ) + ) + ]; + }else{ + + $image = [ + "ratio" => null, + "url" => null + ]; + } + + // get page type + $pagetype_probe = + $this->fuckhtml + ->getElementsByTagName( + "b" + ); + + $pagetype = "web"; + foreach($pagetype_probe as $probe){ + + $pagetype = + strtolower( + trim( + $this->fuckhtml + ->getTextContent( + $probe + ), + " 【】" + ) + ); + } + + // get extra links + $sublinks = []; + + foreach($div as $d){ + + if( + isset($d["attributes"]["class"]) && + strpos($d["attributes"]["class"], "exta-link") !== false + ){ + + $this->fuckhtml->load($d); + + $links = + $this->fuckhtml + ->getElementsByClassName( + "cos-space-mt-xs", + "div" + ); + + foreach($links as $link){ + + $this->fuckhtml->load($link); + $s_title = + $this->fuckhtml + ->getElementsByTagName( + "h3" + ); + + if(count($s_title) === 0){ + + // should not happen + continue; + } + + $data2 = + json_decode( + $this->fuckhtml + ->getTextContent( + $s_title[0]["attributes"]["data-click"] + ), + true + ); + + if(!isset($data2["clk_info"])){ + + // wtf + continue; + } + + $data2 = + json_decode( + $data2["clk_info"], + true + ); + + if(!isset($data2["url"])){ + + // no link, fuck off + continue; + } + + $url = + rawurldecode( + $data2["url"] + ); + + $data = + $this->fuckhtml + ->getElementsByTagName( + "p" + ); + + $s_description = null; + + if(count($data) !== 0){ + + $data = + json_decode( + $this->fuckhtml + ->getTextContent( + $data[0]["attributes"]["sub-show-log"] + ), + true + ); + + if(isset($data["ext"]["content"])){ + + $s_description = $data["ext"]["content"]; + } + } + + $sublinks[] = [ + "title" => + $this->fuckhtml + ->getTextContent( + $s_title[0] + ), + "description" => $s_description, + "url" => $url, + "date" => null + ]; + } + break; + } + } + + $out["web"][] = [ + "title" => $title, + "description" => $description, + "url" => + $this->fuckhtml + ->getTextContent( + $datafield["attributes"]["mu"] + ), + "date" => $date, + "type" => $pagetype, + "thumb" => $image, + "sublink" => $sublinks, + "table" => [] + ]; + + continue; + } + + // parse special result + $result = + $this->fuckhtml + ->getElementsByClassName( + "result-op", + [$datafield] + ); + + if(count($result) !== 0){ + + // + // Parse video carousel + // + if( + isset($datafield["attributes"]["tpl"]) && + stripos($datafield["attributes"]["tpl"], "video") !== false + ){ + + preg_match( + '//U', + $datafield["innerHTML"], + $matches + ); + + if(isset($matches[1])){ + + $json = + json_decode( + $matches[1], + true + ); + + if($json !== null){ + + foreach($json["videoList"] as $video){ + + $out["video"][] = [ + "title" => $video["title"], + "description" => + $this->titledots( + $video["desc"] + ), + "date" => + $this->parse_time( + $video["pubTime"] + ), + "duration" => + $this->hms2int( + $video["duration"] + ), + "views" => + $this->parse_viewcount( + $video["playCount"] + ), + "thumb" => [ + "ratio" => "16:9", + "url" => $video["poster"] + ], + "url" => $video["bindProps"]["link"] + ]; + } + } + } + continue; + } + + // + // Special result div (wiki entries, rich divs) + // + $title = + $this->fuckhtml + ->getElementsByTagName( + "h3" + ); + + if(count($title) === 0){ + + // should have a title somewhere + continue; + } + + $title = + explode( + ">", + $this->fuckhtml + ->getTextContent( + $title[0] + ), + 2 + ); + + if(count($title) === 2){ + + $title = $title[1]; + }else{ + + $title = $title[0]; + } + + // probe for wiki-like entry + $description = + $this->fuckhtml + ->getElementsByClassName( + "sc-paragraph", + "p" + ); + + if(count($description) === 0){ + + // try and get grey description + $description = + $this->fuckhtml + ->getElementsByClassName( + "c-color-gray2", + "p" + ); + + if(count($description) === 0){ + + // probe for special social media description + $description = + $this->fuckhtml + ->getElementsByClassName( + "c-color-text", + "div" + ); + + if(isset($description[0]["attributes"]["aria-label"])){ + + $description = + $this->fuckhtml + ->getTextContent( + $description[0] + ["attributes"] + ["aria-label"] + ); + }else{ + + // check for news tab description + $span = + $this->fuckhtml + ->getElementsByClassName( + "c-font-normal", + "span" + ); + + $description = null; + + foreach($span as $s){ + + if(isset($s["attributes"]["aria-label"])){ + + $description = + $this->titledots( + $this->fuckhtml + ->getTextContent( + $span[count($span) - 1] + ) + ); + + break; + } + } + } + }else{ + + $description = + $this->fuckhtml + ->getTextContent( + $description[0] + ); + } + + }else{ + + preg_match( + '/([\S\s]*)/U', + $description[count($description) - 1]["innerHTML"], + $matches + ); + + if(isset($matches[1])){ + + $description = + $this->titledots( + $this->fuckhtml + ->getTextContent( + $matches[1] + ) + ); + }else{ + + $description = null; + } + } + + // get thumbnail + $thumb = + $this->fuckhtml + ->getElementsByTagName( + "img" + ); + + if(count($thumb) !== 0){ + + $thumb = [ + "ratio" => "1:1", + "url" => + $this->unfuckthumb( + $this->fuckhtml + ->getTextContent( + $thumb[0]["attributes"]["src"] + ) + ) + ]; + }else{ + + $thumb = [ + "ratio" => null, + "url" => null + ]; + } + + // get sublinks + preg_match( + '//U', + $datafield["innerHTML"], + $matches + ); + + $sublinks = []; + + if(isset($matches[1])){ + + $json = + json_decode( + $matches[1], + true + ); + + if($json !== null){ + + if(isset($json["buttons"])){ + + foreach($json["buttons"] as $button){ + + $sublinks[] = [ + "title" => $button["text"], + "description" => null, + "date" => null, + "url" => $button["url"] + ]; + } + }elseif(isset($json["mthreadList"])){ + + foreach($json["mthreadList"] as $thread){ + + $sublinks[] = [ + "title" => + $this->fuckhtml + ->getTextContent( + $thread["title"] + ), + "description" => null, + "date" => null, + "url" => $thread["ttsInfo"]["titleUrl"] + ]; + } + } + } + } + + // get URL + // handle http://fakeurl.baidu.com bullshit + $url = + $this->fuckhtml + ->getTextContent( + $datafield["attributes"]["mu"] + ); + + if( + preg_match( + '/^https?:\/\/(?:fakeurl|nourl)(?:\.ubs)?\.baidu\.com/', + $url + ) + ){ + + // we got some bullshit, get jumpUrl instead + $as = + $this->fuckhtml + ->getElementsByTagName( + "a" + ); + + if(count($as) !== 0){ + + $url = + $this->fuckhtml + ->getTextContent( + $as[0]["attributes"]["href"] + ); + } + } + + // get xueshu sublinks + // get list + $xueshu_list = + $this->fuckhtml + ->getElementsByClassName( + "op-xueshu-links-d20-list", + $div + ); + + if(count($xueshu_list) !== 0){ + + $this->fuckhtml->load($xueshu_list[0]); + + $rows = + $this->fuckhtml + ->getElementsByClassName( + "c-row", + "div" + ); + + // remove "read more" bullshit + foreach($rows as $row){ + + if(strpos($row["attributes"]["class"], "op-xueshu-links-more") !== false){ + + $xueshu_list[0]["innerHTML"] = + str_replace( + $row["outerHTML"], + "", + $xueshu_list[0]["innerHTML"] + ); + } + } + + $this->fuckhtml->load($xueshu_list[0]); + + foreach($rows as $row){ + + $this->fuckhtml->load($row); + + if(strpos($row["attributes"]["class"], "op-xueshu-links-more") !== false){ + + continue; + } + + $as = + $this->fuckhtml + ->getElementsByTagName( + "a" + ); + + foreach($as as $a){ + + $sublinks[] = [ + "title" => + $this->titledots( + $this->fuckhtml + ->getTextContent( + $a + ) + ), + "description" => null, + "date" => null, + "url" => + $this->fuckhtml + ->getTextContent( + $a["attributes"]["href"] + ) + ]; + } + } + } + + $out["web"][] = [ + "title" => $title, + "description" => $description, + "url" => $url, + "date" => null, + "type" => "web", + "thumb" => $thumb, + "sublink" => $sublinks, + "table" => [] + ]; + continue; + } + } + + // + // Remove tracking URLs and fetch additonal image resources + // + $this->resolve_urls($proxy, $out, ["web", "video"]); + $this->resolve_images($proxy, $out); + + return $out; + } + + public function image($get){ + + // https://image.baidu.com/search/acjson?word=asmr&rn=60&pn=0&newReq=1 + //$json = file_get_contents("scraper/baidu_img.json"); + + if($get["npt"]){ + + [$params, $proxy] = $this->backend->get($get["npt"], "images"); + $params = json_decode($params, true); + + $params["pn"] = $params["pn"] + 60; + + }else{ + + $proxy = $this->backend->get_ip(); + $params = [ + "word" => $get["s"], + "rn" => 60, // results/page + "pn" => 0, // item increment (0 * 60) + "newReq" => 1 // otherwise json is fucked up + ]; + + switch($get["sort"]){ + + case "latest": $params["latest"] = 1; break; + case "hot": $params["hot"] = 1; break; + } + + if($get["size"] != "any"){ + + $params["z"] = $get["size"]; + } + + if($get["ratio"] != "any"){ + + $params["imgratio"] = $get["ratio"]; + } + + if($get["format"] != "any"){ + + $params["imgformat"] = $get["format"]; + } + + if($get["color"] != "any"){ + + $params["ic"] = $get["color"]; + } + + switch($get["type"]){ + + case "hd": $params["hd"] = 1; break; + case "isImgSet": $params["isImgSet"] = 1; break; + case "copyright": $params["copyright"] = 1; break; + } + } + + try{ + + $json = + $this->get( + $proxy, + "https://image.baidu.com/search/acjson", + $params, + "https://image.baidu.com/search/index?tn=baiduimage&word=" . urlencode($get["s"]) + ); + }catch(Exception $error){ + + throw new Exception("Failed to fetch JSON"); + } + + $json = json_decode($json, true); + + if($json === null){ + + // detect captcha first + $this->fuckhtml->load($json); + $this->detect_ass(); + + // fallback to json decode error + throw new Exception("Failed to decode JSON"); + } + + if( + isset($json["message"]) && + $json["message"] != "success" + ){ + + throw new Exception("Baidu returned an error: {$json["message"]}"); + } + + if(!isset($json["data"]["images"])){ + + throw new Exception("Baidu did not return an image object"); + } + + $out = [ + "status" => "ok", + "npt" => null, + "image" => [] + ]; + + foreach($json["data"]["images"] as $image){ + + parse_str(parse_url($image["thumburl"], PHP_URL_QUERY), $thumb_size); + + $out["image"][] = [ + "title" => + $this->fuckhtml + ->getTextContent( + $image["titleShow"] + ), + "source" => [ + [ + "url" => $image["objurl"], + "width" => (int)$image["width"], + "height" => (int)$image["height"] + ], + [ // thumbnail + "url" => $image["thumburl"], + "width" => (int)$thumb_size["w"], + "height" => (int)$thumb_size["h"] + ] + ], + "url" => $image["fromUrl"] + ]; + } + + // + // Detect if there's a next page + // + if((int)$json["data"]["totalNum"] >= $params["pn"] + 60){ + + $out["npt"] = + $this->backend->store( + json_encode($params), + "images", + $proxy + ); + } + + return $out; + } + + public function video($get){ + + // https://www.baidu.com/sf/vsearch?pd=video&tn=vsearch&wd=jak%2Band%2Bdaxter&async=1&pn=0 + // increase &pn +20 for pagination + + //$html = file_get_contents("scraper/baidu_vid.html"); + + if($get["npt"]){ + + [$params, $proxy] = $this->backend->get($get["npt"], "videos"); + $params = json_decode($params, true); + + $params["pn"] = $params["pn"] + 10; + }else{ + + $proxy = $this->backend->get_ip(); + $params = [ + "pd" => "video", + "tn" => "vsearch", + "wd" => $get["s"], + "async" => 1, + "pn" => 0 + ]; + } + + try{ + $html = + $this->get( + $proxy, + "https://www.baidu.com/sf/vsearch", + $params + ); + }catch(Exception $error){ + + throw new Exception("Failed to get search page"); + } + + $html = + str_replace( + ["\r", "\n"], + "", + $html + ); + + $out = [ + "status" => "ok", + "npt" => null, + "video" => [], + "author" => [], + "livestream" => [], + "playlist" => [], + "reel" => [] + ]; + + $html = explode("