backend = new backend("baidu");
		
		include "lib/fuckhtml.php";
		$this->fuckhtml = new fuckhtml();
		
		// bookkeeping for the parallel redirect resolver (see resolve_urls)
		$this->handles = [];
		$this->proc = null;
		$this->handle_category = null;
		$this->handle_increment = 0;
		$this->sublink_increment = 0;
		
		// cookie header value carried between requests, null until one is received
		$this->cookie = null;
	}
	
	/**
	 * Describe the filters offered for a given search page.
	 *
	 * @param string $page One of "web", "images", "videos" or "news"
	 * @return array|null Filter definitions keyed by filter name; nothing is
	 *                    returned for a page name that is not listed here
	 */
	public function getfilters($page){
		
		switch($page){
			
			case "web":
				return [
					"newer" => [
						"display" => "Newer than",
						"option" => "_DATE"
					],
					"older" => [
						"display" => "Older than",
						"option" => "_DATE"
					]
				];
			
			case "images":
				return [
					"sort" => [
						"display" => "Sort",
						"option" => [
							"relevance" => "Relevance", // no param
							"latest" => "Latest", // &latest=1
							"hot" => "Hot" // &hot=1
						]
					],
					"size" => [
						"display" => "Size",
						"option" => [
							"any" => "Any size",
							"7" => "Extra large (1080px+)", // &z=7
							"6" => "Large (600px~1080px)", // &z=6
							"5" => "Medium (300px~600px)", // &z=5
							"4" => "Small (1px~300px)" // &z=4
						]
					],
					"ratio" => [
						"display" => "Ratio",
						"option" => [
							"any" => "Any ratio",
							"1" => "Tall vertical", // &imgratio=1
							"2" => "Vertical", // &imgratio=2
							"3" => "Square", // &imgratio=3
							"4" => "Horizontal", // &imgratio=4
							"5" => "Wide horizontal" // &imgratio=5
						]
					],
					"format" => [
						"display" => "Format",
						"option" => [
							"any" => "Any format",
							"3" => "JPG", // &imgformat=3
							"5" => "JPEG", // &imgformat=5
							"4" => "PNG", // &imgformat=4
							"2" => "BMP", // &imgformat=2
							"6" => "GIF (Animated)" // &imgformat=6
						]
					],
					"color" => [
						"display" => "Color",
						"option" => [
							"any" => "Any color",
							"1024" => "White", // &ic=1024
							"2048" => "Black & White",
							"512" => "Black",
							"64" => "Magenta",
							"16" => "Blue",
							"1" => "Red",
							"2" => "Yellow",
							"32" => "Purple",
							"4" => "Green",
							"8" => "Teal",
							"256" => "Orange",
							"128" => "Brown"
						]
					],
					"type" => [
						"display" => "Type",
						"option" => [
							"any" => "Any type",
							"hd" => "HD", // &hd=1
							"isImgSet" => "Photo album", // &isImgSet=1
							"copyright" => "Copyright" // &copyright=1
						]
					]
				];
			
			case "videos":
				return [];
			
			case "news":
				return [
					"category" => [
						"display" => "Category",
						"option" => [
							"any" => "All news",
							"media" => "Media websites", // &medium=1
							"baijiahao" => "Baidu Baijiahao" // &medium=2
						]
					]
				];
		}
	}
	
	/**
	 * Perform a GET request and remember the cookies the server sets.
	 *
	 * @param mixed        $proxy   Passed through to backend::assign_proxy()
	 * @param string       $url     Endpoint without query string
	 * @param array        $get     Query parameters, appended when non-empty
	 * @param string|false $referer When not false, the request is sent with
	 *                              the JSON/CORS header set and this Referer;
	 *                              otherwise with the document/navigate set
	 * @return mixed Whatever curl_exec() returned (the response body)
	 * @throws Exception Carrying the cURL error message when the transfer fails
	 */
	private function get($proxy, $url, $get = [], $referer = false){
		
		$curlproc = curl_init();
		
		if($get !== []){
			
			$get = http_build_query($get);
			$url .= "?" . $get;
		}
		
		// collect Set-Cookie name/value pairs while the response headers stream in
		$cookies_tmp = [];
		curl_setopt(
			$curlproc,
			CURLOPT_HEADERFUNCTION,
			function($curlproc, $header) use (&$cookies_tmp){
				
				$length = strlen($header);
				$header = explode(":", $header, 2);
				
				if(trim(strtolower($header[0])) == "set-cookie"){
					
					$cookie_tmp = explode("=", trim($header[1]), 2);
					
					// a Set-Cookie line without "=" yields an empty value
					$cookies_tmp[trim($cookie_tmp[0])] =
						explode(";", $cookie_tmp[1] ?? "", 2)[0];
				}
				
				return $length;
			}
		);
		
		curl_setopt($curlproc, CURLOPT_URL, $url);
		curl_setopt($curlproc, CURLOPT_ENCODING, ""); // default encoding
		
		//
		// Build the header list once; the four original variants only
		// differed in Accept, Referer, Cookie and the Sec-Fetch-* tail.
		// The order of the headers is kept as it was.
		//
		$has_cookie =
			$this->cookie !== null &&
			$this->cookie !== "";
		
		$headers = ["User-Agent: " . config::USER_AGENT];
		
		if($referer === false){
			
			$headers[] = "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8";
		}else{
			
			$headers[] = "Accept: application/json, text/plain, */*";
		}
		
		$headers[] = "Accept-Language: en-US,en;q=0.5";
		$headers[] = "Accept-Encoding: gzip, deflate, br, zstd";
		
		if($referer !== false){
			
			$headers[] = "Referer: {$referer}";
		}
		
		$headers[] = "DNT: 1";
		$headers[] = "Sec-GPC: 1";
		$headers[] = "Connection: keep-alive";
		
		if($has_cookie){
			
			$headers[] = "Cookie: {$this->cookie}";
		}
		
		$headers[] = "Upgrade-Insecure-Requests: 1";
		
		if($referer === false){
			
			$headers[] = "Sec-Fetch-Dest: document";
			$headers[] = "Sec-Fetch-Mode: navigate";
			$headers[] = "Sec-Fetch-Site: cross-site";
			$headers[] = "Priority: u=0, i";
		}else{
			
			$headers[] = "Sec-Fetch-Dest: empty";
			$headers[] = "Sec-Fetch-Mode: cors";
			$headers[] = "Sec-Fetch-Site: same-origin";
		}
		
		curl_setopt($curlproc, CURLOPT_HTTPHEADER, $headers);
		
		curl_setopt($curlproc, CURLOPT_RETURNTRANSFER, true);
		curl_setopt($curlproc, CURLOPT_SSL_VERIFYHOST, 2);
		curl_setopt($curlproc, CURLOPT_SSL_VERIFYPEER, true);
		curl_setopt($curlproc, CURLOPT_CONNECTTIMEOUT, 30);
		curl_setopt($curlproc, CURLOPT_TIMEOUT, 30);
		
		$this->backend->assign_proxy($curlproc, $proxy);
		
		$data = curl_exec($curlproc);
		
		if(curl_errno($curlproc)){
			
			// read the message first, then release the handle before throwing
			$message = curl_error($curlproc);
			curl_close($curlproc);
			
			throw new Exception($message);
		}
		
		// store cookie: previous value first, then the newly received pairs
		$cookie_parts = [];
		
		if($has_cookie){
			
			$cookie_parts[] = $this->cookie;
		}
		
		foreach($cookies_tmp as $cookie_name => $cookie_value){
			
			$cookie_parts[] = $cookie_name . "=" . $cookie_value;
		}
		
		if($cookie_parts !== []){
			
			$this->cookie = rtrim(implode("; ", $cookie_parts), " ;");
		}
		
		curl_close($curlproc);
		return $data;
	}
	
	/**
	 * Queue a header-only request for a Baidu redirect link on the shared
	 * multi handle. URLs that are not baidu.com/link? redirects are ignored.
	 *
	 * The handle is filed under the current category / item / sublink
	 * counters so resolve_urls() can map the answer back to its result.
	 *
	 * @param mixed  $proxy Passed through to backend::assign_proxy()
	 * @param string $url   Candidate redirect URL
	 * @return void
	 */
	private function redirect_add_url($proxy, $url){
		
		if(
			preg_match(
				'/^https?:\/\/(?:www\.)?baidu\.com\/link\?/',
				$url
			) === 0
		){
			
			// not a baidu redirect
			return;
		}
		
		$curlproc = curl_init();
		
		curl_setopt($curlproc, CURLOPT_URL, $url);
		curl_setopt($curlproc, CURLOPT_ENCODING, ""); // default encoding
		curl_setopt(
			$curlproc,
			CURLOPT_HTTPHEADER,
			["User-Agent: " . config::USER_AGENT,
			"Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
			"Accept-Language: en-US,en;q=0.5",
			"Accept-Encoding: gzip, deflate, br, zstd",
			"DNT: 1",
			"Sec-GPC: 1",
			"Connection: keep-alive",
			"Upgrade-Insecure-Requests: 1",
			"Sec-Fetch-Dest: document",
			"Sec-Fetch-Mode: navigate",
			"Sec-Fetch-Site: none",
			"Sec-Fetch-User: ?1",
			"Priority: u=0, i"]
		);
		
		curl_setopt($curlproc, CURLOPT_RETURNTRANSFER, true);
		curl_setopt($curlproc, CURLOPT_SSL_VERIFYHOST, 2);
		curl_setopt($curlproc, CURLOPT_SSL_VERIFYPEER, true);
		curl_setopt($curlproc, CURLOPT_CONNECTTIMEOUT, 30);
		curl_setopt($curlproc, CURLOPT_TIMEOUT, 30);
		
		// only the response headers are needed (Location:)
		curl_setopt($curlproc, CURLOPT_HEADER, true);
		curl_setopt($curlproc, CURLOPT_NOBODY, true);
		
		$this->backend->assign_proxy($curlproc, $proxy);
		
		curl_multi_add_handle($this->proc, $curlproc);
		
		$this->handles
			[$this->handle_category]
			[$this->handle_increment]
			[$this->sublink_increment]
			= $curlproc;
	}
	
	/**
	 * Replace Baidu redirect links in $collection with their Location target.
	 *
	 * Every item "url" and every item "sublink"[n]["url"] in the listed
	 * categories is probed in parallel; entries whose response carries a
	 * Location header are overwritten in place.
	 *
	 * @param mixed $proxy       Passed through to redirect_add_url()
	 * @param array &$collection Result set, modified in place
	 * @param array $categories  Keys of $collection to process
	 * @return void
	 */
	private function resolve_urls($proxy, &$collection, $categories){
		
		$this->proc = curl_multi_init();
		
		// start from a clean slate so handles closed by an earlier call
		// are never touched again
		$this->handles = [];
		
		foreach($categories as $category){
			
			$this->sublink_increment = 0;
			$this->handle_increment = 0;
			$this->handle_category = $category;
			
			foreach($collection[$category] as $item){
				
				// slot 0 is the item itself, slots 1..n are its sublinks
				$this->sublink_increment = 0;
				$this->redirect_add_url($proxy, $item["url"]);
				
				if(isset($item["sublink"])){
					
					foreach($item["sublink"] as $sublink){
						
						$this->sublink_increment++;
						$this->redirect_add_url($proxy, $sublink["url"]);
					}
				}
				
				$this->handle_increment++;
			}
		}
		
		do{
			
			$status = curl_multi_exec($this->proc, $active);
			
			if(
				$active &&
				$status == CURLM_OK
			){
				
				// wait for socket activity instead of spinning on the CPU
				curl_multi_select($this->proc);
			}
			
		}while($active && $status == CURLM_OK);
		
		//
		// All transfers are finished at this point
		//
		foreach($this->handles as $category => $v){
			
			foreach($v as $index => $data){
				
				foreach($this->handles[$category][$index] as $sublinkindex => $handle){
					
					preg_match(
						'/location: ?(.*)$/im',
						curl_multi_getcontent($handle),
						$location
					);
					
					if(isset($location[1])){
						
						if($sublinkindex === 0){
							
							$collection[$category][$index]["url"] = trim($location[1]);
						}else{
							
							$collection[$category][$index]["sublink"][$sublinkindex - 1]["url"] = trim($location[1]);
						}
					}
					
					curl_multi_remove_handle($this->proc, $handle);
					curl_close($handle);
				}
			}
		}
		
		curl_multi_close($this->proc);
	}
	
	/**
	 * Swap the scraped image list for the one embedded in Baidu's image
	 * detail page, which carries the direct image URLs.
	 *
	 * Does nothing when the second image is missing or is not an
	 * image.baidu.com detail link, when the page cannot be fetched, or when
	 * the page holds no image data.
	 *
	 * @param mixed $proxy Passed through to get()
	 * @param array &$data Result set; $data["image"] is replaced in place
	 * @return void
	 */
	private function resolve_images($proxy, &$data){
		
		// get the image viewer that contains all of the images direct URLs
		// for some reason, getting the second image's url in the set
		// doesnt trigger the captcha
		if(
			!isset($data["image"][1]["url"]) ||
			preg_match(
				'/^https:\/\/image\.baidu\.com\/search\/detail/',
				$data["image"][1]["url"]
			) === 0
		){
			
			// we have an already resolved image link, do nothing
			return;
		}
		
		try{
			$html =
				$this->get(
					$proxy,
					$data["image"][1]["url"],
					[]
				);
		}catch(Exception $error){
			
			// fallback to the limited dataset we have
			return;
		}
		
		$this->fuckhtml->load($html);
		
		$script =
			$this->fuckhtml
			->getElementById(
				"image-detail-data",
				"script"
			);
		
		if($script){
			
			$json =
				json_decode(
					$script["innerHTML"],
					true
				);
			
			if(
				!isset($json["data"]["images"]) ||
				count($json["data"]["images"]) === 0
			){
				
				// do nothing
				return;
			}
			
			//
			// Discard all previously scraped images and use data
			// from the newly downloaded image carousel
			// the imageset !!should!! be the same
			//
			$data["image"] = [];
			
			foreach($json["data"]["images"] as $image){
				
				// thumbnail dimensions travel in the thumbnail URL's query string
				parse_str((string)parse_url($image["thumburl"], PHP_URL_QUERY), $thumb_size);
				
				$data["image"][] = [
					"title" =>
						$this->fuckhtml
						->getTextContent(
							$image["titleShow"]
						),
					"source" => [
						[
							"url" => $image["objurl"],
							"width" => (int)$image["width"],
							"height" => (int)$image["height"]
						],
						[ // thumbnail
							"url" => $image["thumburl"],
							"width" => (int)($thumb_size["w"] ?? 0),
							"height" => (int)($thumb_size["h"] ?? 0)
						]
					],
					"url" => $image["fromUrl"]
				];
			}
		}
	}
	
	/**
	 * Run a web search, or continue one from a next-page token.
	 *
	 * @param array $get Request parameters; reads "npt", "s", "older", "newer"
	 * @return array Parsed result set as built by parse_search()
	 * @throws Exception "Failed to fetch search page" when the request fails
	 */
	public function web($get){
		
		if($get["npt"]){
			
			[$json, $proxy] = $this->backend->get($get["npt"], "web");
			
			$json = json_decode($json, true);
			$this->cookie = $json["cookie"];
			
			// advance by one page of 20 results
			$npt_data = $json["req"];
			$npt_data["pn"] = $npt_data["pn"] + 20;
			
			try{
				$html =
					$this->get(
						$proxy,
						"https://www.baidu.com/s",
						$npt_data
					);
			}catch(Exception $error){
				
				throw new Exception("Failed to fetch search page");
			}
		}else{
			
			//
			// Get authentication token
			//
			$proxy = $this->backend->get_ip();
			
			// running this would populate $this->cookie
			// @TODO probably not needed? requests get blocked regardless
			//$this->get($proxy, "https://www.baidu.com", []);
			
			$npt_data = [
				"wd" => $get["s"],
				"rn" => 20
			];
			
			// &gpc=stf%3D0%2C1752638400|stftype%3D2
			// NOTE(review): "older" fills the first stf slot and "newer" the
			// second, and a false "newer" interpolates as an empty string --
			// confirm both against the parameter format Baidu expects
			if(
				$get["older"] !== false ||
				$get["newer"] !== false
			){
				
				if($get["older"] === false){
					
					$get["older"] = 0;
				}
				
				$npt_data["gpc"] = "stf={$get["older"]},{$get["newer"]}|stftype=2";
			}
			
			try{
				$html =
					$this->get(
						$proxy,
						"https://www.baidu.com/s",
						$npt_data
					);
			}catch(Exception $error){
				
				throw new Exception("Failed to fetch search page");
			}
			
			// first page; stored so the next-page token can increment it
			$npt_data["pn"] = 0;
		}
		
		return $this->parse_search($proxy, "web", $npt_data, $html);
	}
	
	/**
	 * Turn a Baidu result page into the scraper's result structure.
	 *
	 * @param mixed  $proxy    Proxy used for the page; reused for follow-ups
	 * @param string $pagetype Page name the next-page token is stored under
	 * @param array  $npt_data Request parameters saved in the next-page token
	 * @param string $html     Raw result page
	 * @return array Keys: status, spelling, npt, answer, web, image, video,
	 *               news, related
	 */
	private function parse_search($proxy, $pagetype, $npt_data, $html){
		
		// @HACK
		// remove newlines from the html, they confuse the fuckhtml parser
		$html = str_replace(["\n", "\r"], "", $html);
		
		$out = [
			"status" => "ok",
			"spelling" => [
				"type" => "no_correction",
				"using" => null,
				"correction" => null
			],
			"npt" => null,
			"answer" => [],
			"web" => [],
			"image" => [],
			"video" => [],
			"news" => [],
			"related" => []
		];
		
		$this->fuckhtml->load($html);
		
		$this->detect_ass();
		
		$datafields =
			$this->fuckhtml
			->getElementsByAttributeName(
				"id",
				"div"
			);
		
		//
		// Get next page
		//
		$npt =
			$this->fuckhtml
			->getElementsByClassName(
				"n",
				"a"
			);
		
		if(count($npt) !== 0){
			
			$out["npt"] =
				$this->backend->store(
					json_encode([
						"req" => $npt_data,
						"cookie" => $this->cookie
					]),
					$pagetype,
					$proxy
				);
		}
		
		//
		// Get related searches
		//
		$related_container =
			$this->fuckhtml
			->getElementById(
				"rs_new",
				$datafields
			);
		
		if($related_container){
			
			$this->fuckhtml->load($related_container);
			
			$as =
				$this->fuckhtml
				->getElementsByClassName(
					"c-color-link",
					"a"
				);
			
			foreach($as as $a){
				
				// keep only what follows the first ">" when there is one
				$text =
					explode(
						">",
						$this->fuckhtml
						->getTextContent(
							$a
						),
						2
					);
				
				$out["related"][] = $text[count($text) - 1];
			}
		}
		
		foreach($datafields as $datafield){
			
			if(
				!isset($datafield["attributes"]["id"]) ||
				preg_match(
					'/^[0-9]+$/',
					$datafield["attributes"]["id"]
				) === 0
			){
				
				// not a search result
				continue;
			}
			
			$this->fuckhtml->load($datafield);
			
			$div =
				$this->fuckhtml
				->getElementsByTagName(
					"div"
				);
			
			//
			// Don't parse as a search result if it's a card
			//
			$card =
				$this->fuckhtml
				->getElementsByClassName(
					"cosc-card",
					$div
				);
			
			if(count($card) !== 0){
				
				//
				// Parse short-video cards
				//
				$ytshorts_probe =
					$this->fuckhtml
					->getElementsByClassName(
						"tts-b-item",
						$div
					);
				
				if(count($ytshorts_probe) !== 0){
					
					$videos =
						$this->fuckhtml
						->getElementsByAttributeValue(
							"data-show",
							"list",
							$div
						);
					
					foreach($videos as $video){
						
						$this->fuckhtml->load($video);
						
						$title =
							$this->fuckhtml
							->getElementsByClassName(
								"cosc-title-slot",
								"span"
							);
						
						if(count($title) === 0){
							
							continue;
						}
						
						$url =
							$this->fuckhtml
							->getElementsByTagName(
								"a"
							);
						
						if(count($url) === 0){
							
							continue;
						}
						
						$image =
							$this->fuckhtml
							->getElementsByClassName(
								"cos-image-body",
								"img"
							);
						
						if(count($image) === 0){
							
							$image = [
								"ratio" => null,
								"url" => null
							];
						}else{
							
							$image = [
								"ratio" => "1:1",
								"url" =>
									$this->fuckhtml
									->getTextContent(
										$image[0]["attributes"]["src"]
									)
							];
						}
						
						// get duration
						$divs =
							$this->fuckhtml
							->getElementsByAttributeName(
								"class",
								"div"
							);
						
						$duration = null;
						foreach($divs as $probe){
							
							if(strpos($probe["attributes"]["class"], "tag-bottom-right") !== false){
								
								$duration =
									$this->hms2int(
										$this->fuckhtml
										->getTextContent(
											$probe
										)
									);
								break;
							}
						}
						
						$out["video"][] = [
							"title" =>
								$this->fuckhtml
								->getTextContent(
									$title[0]
								),
							"description" => null,
							"date" => null,
							"duration" => $duration,
							"views" => null,
							"thumb" => $image,
							"url" =>
								$this->fuckhtml
								->getTextContent(
									$url[0]["attributes"]["href"]
								)
						];
					}
				}
				
				//
				// Parse image carousel
				//
				$is_image_carousel = false;
				foreach($div as $d){
					
					if(
						isset($d["attributes"]["class"]) &&
						strpos($d["attributes"]["class"], "image-container") !== false
					){
						
						$is_image_carousel = true;
						break;
					}
				}
				
				if($is_image_carousel){
					
					// NOTE(review): this pattern has no capture group, so
					// $matches[1] is never set and the branch below cannot
					// run -- the expression looks truncated, restore it
					// from the upstream source
					preg_match(
						'//U',
						$datafield["innerHTML"],
						$matches
					);
					
					if(isset($matches[1])){
						
						// weird behavior with the smaller image carousel where --cos* CSS variables are escaped wrong
						$json =
							$this->fuckhtml
							->parseJsObject(
								str_replace(
									"-\-",
									"--",
									$matches[1]
								)
							);
						
						if(
							$json !== null &&
							isset($json["imageList"][0]["images"])
						){
							
							// parse image carousel
							foreach($json["imageList"][0]["images"] as $image){
								
								parse_str((string)parse_url($image["thumburl"], PHP_URL_QUERY), $thumb_size);
								
								$out["image"][] = [
									"title" => "image",
									"source" => [
										[
											"url" => $image["objurl"],
											"width" => (int)$image["width"],
											"height" => (int)$image["height"]
										],
										[ // thumbnail
											"url" => $image["thumburl"],
											"width" => (int)($thumb_size["w"] ?? 0),
											"height" => (int)($thumb_size["h"] ?? 0)
										]
									],
									"url" => $image["jumpUrl"]
								];
							}
						}
					}
				}
				
				continue;
			}
			
			if(!isset($datafield["attributes"]["mu"])){
				
				// dont scrape if we dont have the direct link
				continue;
			}
			
			// class:FYB_RD -> news block, ignored
			
			$result =
				$this->fuckhtml
				->getElementsByClassName(
					"result",
					[$datafield]
				);
			
			if(count($result) !== 0){
				
				//
				// Parse normal search result
				//
				$title =
					$this->fuckhtml
					->getElementsByClassName(
						"sc-link",
						"a"
					);
				
				if(count($title) === 0){
					
					// should not happen
					continue;
				}
				
				$title =
					$this->titledots(
						$this->fuckhtml
						->getTextContent(
							$title[0]
						)
					);
				
				$description =
					$this->fuckhtml
					->getElementsByClassName(
						"c-color",
						$div
					);
				
				if(count($description) !== 0){
					
					$this->fuckhtml->load($description[0]);
					
					$description =
						$this->fuckhtml
						->getElementsByAttributeName(
							"class",
							"span"
						);
					
					$found_desc = false;
					foreach($description as $desc){
						
						if(stripos($desc["attributes"]["class"], "summary-text") !== false){
							
							$found_desc = true;
							$description =
								$this->titledots(
									$this->fuckhtml
									->getTextContent(
										$desc
									)
								);
							break;
						}
					}
					
					if($found_desc === false){
						
						$description = null;
					}
					
					// back to the whole result for the probes below
					$this->fuckhtml->load($datafield);
				}else{
					
					$description = null;
				}
				
				// parse date
				$date_probe =
					$this->fuckhtml
					->getElementsByClassName(
						"cos-color-text-minor",
						"span"
					);
				
				if(count($date_probe) !== 0){
					
					$date =
						$this->parse_time(
							$this->fuckhtml
							->getTextContent(
								$date_probe[0]
							)
						);
				}else{
					
					$date = null;
				}
				
				// parse image
				$img =
					$this->fuckhtml
					->getElementsByTagName(
						"img"
					);
				
				if(count($img) !== 0){
					
					$image = [
						"ratio" => "16:9",
						"url" =>
							$this->unfuckthumb(
								$this->fuckhtml
								->getTextContent(
									$img[0]["attributes"]["src"]
								)
							)
					];
				}else{
					
					$image = [
						"ratio" => null,
						"url" => null
					];
				}
				
				// get page type: the last <b> tag wins, "web" when there is none
				$pagetype_probe =
					$this->fuckhtml
					->getElementsByTagName(
						"b"
					);
				
				$result_type = "web";
				foreach($pagetype_probe as $probe){
					
					$result_type =
						strtolower(
							trim(
								$this->fuckhtml
								->getTextContent(
									$probe
								),
								" 【】"
							)
						);
				}
				
				// get extra links
				$sublinks = [];
				foreach($div as $d){
					
					if(
						isset($d["attributes"]["class"]) &&
						strpos($d["attributes"]["class"], "exta-link") !== false
					){
						
						$this->fuckhtml->load($d);
						
						$links =
							$this->fuckhtml
							->getElementsByClassName(
								"cos-space-mt-xs",
								"div"
							);
						
						foreach($links as $link){
							
							$this->fuckhtml->load($link);
							
							$s_title =
								$this->fuckhtml
								->getElementsByTagName(
									"h3"
								);
							
							if(count($s_title) === 0){
								
								// should not happen
								continue;
							}
							
							// the target URL is nested as JSON inside the
							// JSON of the data-click attribute
							$data2 =
								json_decode(
									$this->fuckhtml
									->getTextContent(
										$s_title[0]["attributes"]["data-click"]
									),
									true
								);
							
							if(!isset($data2["clk_info"])){
								
								// unexpected payload
								continue;
							}
							
							$data2 =
								json_decode(
									$data2["clk_info"],
									true
								);
							
							if(!isset($data2["url"])){
								
								// no link, skip
								continue;
							}
							
							$url =
								rawurldecode(
									$data2["url"]
								);
							
							$data =
								$this->fuckhtml
								->getElementsByTagName(
									"p"
								);
							
							$s_description = null;
							if(count($data) !== 0){
								
								$data =
									json_decode(
										$this->fuckhtml
										->getTextContent(
											$data[0]["attributes"]["sub-show-log"]
										),
										true
									);
								
								if(isset($data["ext"]["content"])){
									
									$s_description = $data["ext"]["content"];
								}
							}
							
							$sublinks[] = [
								"title" =>
									$this->fuckhtml
									->getTextContent(
										$s_title[0]
									),
								"description" => $s_description,
								"url" => $url,
								"date" => null
							];
						}
						break;
					}
				}
				
				$out["web"][] = [
					"title" => $title,
					"description" => $description,
					"url" =>
						$this->fuckhtml
						->getTextContent(
							$datafield["attributes"]["mu"]
						),
					"date" => $date,
					"type" => $result_type,
					"thumb" => $image,
					"sublink" => $sublinks,
					"table" => []
				];
				continue;
			}
			
			// parse special result
			$result =
				$this->fuckhtml
				->getElementsByClassName(
					"result-op",
					[$datafield]
				);
			
			if(count($result) !== 0){
				
				//
				// Parse video carousel
				//
				if(
					isset($datafield["attributes"]["tpl"]) &&
					stripos($datafield["attributes"]["tpl"], "video") !== false
				){
					
					// NOTE(review): no capture group in this pattern, so
					// $matches[1] is never set -- looks truncated
					preg_match(
						'//U',
						$datafield["innerHTML"],
						$matches
					);
					
					if(isset($matches[1])){
						
						$json =
							json_decode(
								$matches[1],
								true
							);
						
						if($json !== null){
							
							foreach($json["videoList"] as $video){
								
								$out["video"][] = [
									"title" => $video["title"],
									"description" =>
										$this->titledots(
											$video["desc"]
										),
									"date" =>
										$this->parse_time(
											$video["pubTime"]
										),
									"duration" =>
										$this->hms2int(
											$video["duration"]
										),
									"views" =>
										$this->parse_viewcount(
											$video["playCount"]
										),
									"thumb" => [
										"ratio" => "16:9",
										"url" => $video["poster"]
									],
									"url" => $video["bindProps"]["link"]
								];
							}
						}
					}
					
					continue;
				}
				
				//
				// Special result div (wiki entries, rich divs)
				//
				$title =
					$this->fuckhtml
					->getElementsByTagName(
						"h3"
					);
				
				if(count($title) === 0){
					
					// should have a title somewhere
					continue;
				}
				
				$title =
					explode(
						">",
						$this->fuckhtml
						->getTextContent(
							$title[0]
						),
						2
					);
				
				if(count($title) === 2){
					
					$title = $title[1];
				}else{
					
					$title = $title[0];
				}
				
				// probe for wiki-like entry
				$description =
					$this->fuckhtml
					->getElementsByClassName(
						"sc-paragraph",
						"p"
					);
				
				if(count($description) === 0){
					
					// try and get grey description
					$description =
						$this->fuckhtml
						->getElementsByClassName(
							"c-color-gray2",
							"p"
						);
					
					if(count($description) === 0){
						
						// probe for special social media description
						$description =
							$this->fuckhtml
							->getElementsByClassName(
								"c-color-text",
								"div"
							);
						
						if(isset($description[0]["attributes"]["aria-label"])){
							
							$description =
								$this->fuckhtml
								->getTextContent(
									$description[0]
									["attributes"]
									["aria-label"]
								);
						}else{
							
							// check for news tab description
							$span =
								$this->fuckhtml
								->getElementsByClassName(
									"c-font-normal",
									"span"
								);
							
							$description = null;
							foreach($span as $s){
								
								if(isset($s["attributes"]["aria-label"])){
									
									// the text is taken from the last span
									// once any span carries an aria-label
									$description =
										$this->titledots(
											$this->fuckhtml
											->getTextContent(
												$span[count($span) - 1]
											)
										);
									break;
								}
							}
						}
					}else{
						
						$description =
							$this->fuckhtml
							->getTextContent(
								$description[0]
							);
					}
				}else{
					
					// NOTE(review): as written this ungreedy group matches
					// the empty string at offset 0 -- the pattern may be
					// missing its delimiting markers, verify upstream
					preg_match(
						'/([\S\s]*)/U',
						$description[count($description) - 1]["innerHTML"],
						$matches
					);
					
					if(isset($matches[1])){
						
						$description =
							$this->titledots(
								$this->fuckhtml
								->getTextContent(
									$matches[1]
								)
							);
					}else{
						
						$description = null;
					}
				}
				
				// get thumbnail
				$thumb =
					$this->fuckhtml
					->getElementsByTagName(
						"img"
					);
				
				if(count($thumb) !== 0){
					
					$thumb = [
						"ratio" => "1:1",
						"url" =>
							$this->unfuckthumb(
								$this->fuckhtml
								->getTextContent(
									$thumb[0]["attributes"]["src"]
								)
							)
					];
				}else{
					
					$thumb = [
						"ratio" => null,
						"url" => null
					];
				}
				
				// get sublinks
				// NOTE(review): no capture group in this pattern, so
				// $matches[1] is never set -- looks truncated
				preg_match(
					'//U',
					$datafield["innerHTML"],
					$matches
				);
				
				$sublinks = [];
				
				if(isset($matches[1])){
					
					$json =
						json_decode(
							$matches[1],
							true
						);
					
					if($json !== null){
						
						if(isset($json["buttons"])){
							
							foreach($json["buttons"] as $button){
								
								$sublinks[] = [
									"title" => $button["text"],
									"description" => null,
									"date" => null,
									"url" => $button["url"]
								];
							}
						}elseif(isset($json["mthreadList"])){
							
							foreach($json["mthreadList"] as $thread){
								
								$sublinks[] = [
									"title" =>
										$this->fuckhtml
										->getTextContent(
											$thread["title"]
										),
									"description" => null,
									"date" => null,
									"url" => $thread["ttsInfo"]["titleUrl"]
								];
							}
						}
					}
				}
				
				// get URL
				// handle http://fakeurl.baidu.com placeholders
				$url =
					$this->fuckhtml
					->getTextContent(
						$datafield["attributes"]["mu"]
					);
				
				if(
					preg_match(
						'/^https?:\/\/(?:fakeurl|nourl)(?:\.ubs)?\.baidu\.com/',
						$url
					)
				){
					
					// placeholder host, use the first anchor's href instead
					$as =
						$this->fuckhtml
						->getElementsByTagName(
							"a"
						);
					
					if(count($as) !== 0){
						
						$url =
							$this->fuckhtml
							->getTextContent(
								$as[0]["attributes"]["href"]
							);
					}
				}
				
				// get xueshu sublinks
				// get list
				$xueshu_list =
					$this->fuckhtml
					->getElementsByClassName(
						"op-xueshu-links-d20-list",
						$div
					);
				
				if(count($xueshu_list) !== 0){
					
					$this->fuckhtml->load($xueshu_list[0]);
					
					$rows =
						$this->fuckhtml
						->getElementsByClassName(
							"c-row",
							"div"
						);
					
					// remove the "read more" row
					foreach($rows as $row){
						
						if(strpos($row["attributes"]["class"], "op-xueshu-links-more") !== false){
							
							$xueshu_list[0]["innerHTML"] =
								str_replace(
									$row["outerHTML"],
									"",
									$xueshu_list[0]["innerHTML"]
								);
						}
					}
					
					$this->fuckhtml->load($xueshu_list[0]);
					
					foreach($rows as $row){
						
						$this->fuckhtml->load($row);
						
						if(strpos($row["attributes"]["class"], "op-xueshu-links-more") !== false){
							
							continue;
						}
						
						$as =
							$this->fuckhtml
							->getElementsByTagName(
								"a"
							);
						
						foreach($as as $a){
							
							$sublinks[] = [
								"title" =>
									$this->titledots(
										$this->fuckhtml
										->getTextContent(
											$a
										)
									),
								"description" => null,
								"date" => null,
								"url" =>
									$this->fuckhtml
									->getTextContent(
										$a["attributes"]["href"]
									)
							];
						}
					}
				}
				
				$out["web"][] = [
					"title" => $title,
					"description" => $description,
					"url" => $url,
					"date" => null,
					"type" => "web",
					"thumb" => $thumb,
					"sublink" => $sublinks,
					"table" => []
				];
				continue;
			}
		}
		
		//
		// Remove tracking URLs and fetch additional image resources
		//
		$this->resolve_urls($proxy, $out, ["web", "video"]);
		$this->resolve_images($proxy, $out);
		
		return $out;
	}
	
	/**
	 * Run an image search, or continue one from a next-page token.
	 *
	 * @param array $get Request parameters; reads "npt", "s", "sort", "size",
	 *                   "ratio", "format", "color", "type"
	 * @return array Keys: status, npt, image
	 * @throws Exception When the request fails, the response is not JSON,
	 *                   Baidu reports an error, or no image list is returned
	 */
	public function image($get){
		
		// https://image.baidu.com/search/acjson?word=asmr&rn=60&pn=0&newReq=1
		
		//$json = file_get_contents("scraper/baidu_img.json");
		
		if($get["npt"]){
			
			[$params, $proxy] = $this->backend->get($get["npt"], "images");
			
			$params = json_decode($params, true);
			$params["pn"] = $params["pn"] + 60;
		}else{
			
			$proxy = $this->backend->get_ip();
			
			$params = [
				"word" => $get["s"],
				"rn" => 60, // results/page
				"pn" => 0, // item increment (0 * 60)
				"newReq" => 1 // otherwise the json comes back malformed
			];
			
			switch($get["sort"]){
				
				case "latest":
					$params["latest"] = 1;
					break;
				
				case "hot":
					$params["hot"] = 1;
					break;
			}
			
			if($get["size"] != "any"){
				
				$params["z"] = $get["size"];
			}
			
			if($get["ratio"] != "any"){
				
				$params["imgratio"] = $get["ratio"];
			}
			
			if($get["format"] != "any"){
				
				$params["imgformat"] = $get["format"];
			}
			
			if($get["color"] != "any"){
				
				$params["ic"] = $get["color"];
			}
			
			switch($get["type"]){
				
				case "hd":
					$params["hd"] = 1;
					break;
				
				case "isImgSet":
					$params["isImgSet"] = 1;
					break;
				
				case "copyright":
					$params["copyright"] = 1;
					break;
			}
		}
		
		try{
			// the referer is built from the query actually being sent, which
			// on a paginated request lives in the stored parameters
			$response =
				$this->get(
					$proxy,
					"https://image.baidu.com/search/acjson",
					$params,
					"https://image.baidu.com/search/index?tn=baiduimage&word=" . urlencode($params["word"] ?? $get["s"])
				);
		}catch(Exception $error){
			
			throw new Exception("Failed to fetch JSON");
		}
		
		$json = json_decode($response, true);
		
		if($json === null){
			
			// detect captcha first: inspect the raw response, the decoded
			// value is null at this point
			$this->fuckhtml->load($response);
			$this->detect_ass();
			
			// fallback to json decode error
			throw new Exception("Failed to decode JSON");
		}
		
		if(
			isset($json["message"]) &&
			$json["message"] != "success"
		){
			
			throw new Exception("Baidu returned an error: {$json["message"]}");
		}
		
		if(!isset($json["data"]["images"])){
			
			throw new Exception("Baidu did not return an image object");
		}
		
		$out = [
			"status" => "ok",
			"npt" => null,
			"image" => []
		];
		
		foreach($json["data"]["images"] as $image){
			
			// thumbnail dimensions travel in the thumbnail URL's query string
			parse_str((string)parse_url($image["thumburl"], PHP_URL_QUERY), $thumb_size);
			
			$out["image"][] = [
				"title" =>
					$this->fuckhtml
					->getTextContent(
						$image["titleShow"]
					),
				"source" => [
					[
						"url" => $image["objurl"],
						"width" => (int)$image["width"],
						"height" => (int)$image["height"]
					],
					[ // thumbnail
						"url" => $image["thumburl"],
						"width" => (int)($thumb_size["w"] ?? 0),
						"height" => (int)($thumb_size["h"] ?? 0)
					]
				],
				"url" => $image["fromUrl"]
			];
		}
		
		//
		// Detect if there's a next page
		//
		if((int)($json["data"]["totalNum"] ?? 0) >= $params["pn"] + 60){
			
			$out["npt"] =
				$this->backend->store(
					json_encode($params),
					"images",
					$proxy
				);
		}
		
		return $out;
	}
	
	public function video($get){
		
		// https://www.baidu.com/sf/vsearch?pd=video&tn=vsearch&wd=jak%2Band%2Bdaxter&async=1&pn=0
		// increase &pn +20 for pagination
		
		//$html = file_get_contents("scraper/baidu_vid.html");
		
		if($get["npt"]){
			
			[$params, $proxy] = $this->backend->get($get["npt"], "videos");
			
			$params = json_decode($params, true);
			$params["pn"] = $params["pn"] + 10;
		}else{
			
			$proxy = $this->backend->get_ip();
			
			$params = [
				"pd" => "video",
				"tn" => "vsearch",
				"wd" => $get["s"],
				"async" => 1,
				"pn" => 0
			];
		}
		
		try{
			$html =
				$this->get(
					$proxy,
					"https://www.baidu.com/sf/vsearch",
					$params
				);
		}catch(Exception $error){
			
			throw new Exception("Failed to get search page");
		}
		
		$html =
			str_replace(
				["\r", "\n"],
				"",
				$html
			);
		
		$out = [
			"status" => "ok",
			"npt"
=> null, "video" => [], "author" => [], "livestream" => [], "playlist" => [], "reel" => [] ]; $html = explode("