2230 lines
		
	
	
		
			42 KiB
		
	
	
	
		
			PHP
		
	
	
	
	
	
			
		
		
	
	
			2230 lines
		
	
	
		
			42 KiB
		
	
	
	
		
			PHP
		
	
	
	
	
	
<?php
 | 
						||
 | 
						||
class baidu{
 | 
						||
	
 | 
						||
	/**
	 * Load the helper libraries and initialise the per-instance state.
	 *
	 * The libraries are pulled in with include_once: they declare classes, so
	 * a plain include would raise a fatal "Cannot declare class" error as soon
	 * as a second instance is constructed in the same request.
	 */
	public function __construct(){
		
		// backend helper, constructed for the "baidu" scraper
		include_once "lib/backend.php";
		$this->backend = new backend("baidu");
		
		// HTML parser
		include_once "lib/fuckhtml.php";
		$this->fuckhtml = new fuckhtml();
		
		// state shared between resolve_urls() and redirect_add_url():
		// pending curl handles indexed as [category][item index][sublink index]
		$this->handles = [];
		// curl multi handle the pending handles are attached to
		$this->proc = null;
		// position (category / item / sublink) of the URL currently being queued
		$this->handle_category = null;
		$this->handle_increment = 0;
		$this->sublink_increment = 0;
		
		// value sent in the Cookie request header by get(), null until known
		$this->cookie = null;
	}
 | 
						||
	
 | 
						||
	/**
	 * Describe the search filters available for a given result page.
	 *
	 * Each filter maps to a "display" label and an "option" entry, which is
	 * either a list of value => label pairs or the string "_DATE".
	 * The trailing comments name the Baidu query parameter a value maps to.
	 *
	 * @param string $page "web", "images", "videos" or "news"
	 * @return array|null Filter definitions, or null when $page matches none
	 */
	public function getfilters($page){
		
		// loose comparison on purpose: same matching rules as a switch
		if($page == "web"){
			
			$newer = [
				"display" => "Newer than",
				"option" => "_DATE"
			];
			$older = [
				"display" => "Older than",
				"option" => "_DATE"
			];
			
			return [
				"newer" => $newer,
				"older" => $older
			];
		}
		
		if($page == "images"){
			
			$sort = [
				"relevance" => "Relevance", // no param
				"latest" => "Latest", // &latest=1
				"hot" => "Hot" // &hot=1
			];
			
			$size = [
				"any" => "Any size",
				"7" => "Extra large (1080px+)", // &z=7
				"6" => "Large (600px~1080px)", // &z=6
				"5" => "Medium (300px~600px)", // &z=5
				"4" => "Small (1px~300px)" // &z=4
			];
			
			$ratio = [
				"any" => "Any ratio",
				"1" => "Tall vertical", // &imgratio=1
				"2" => "Vertical", // &imgratio=2
				"3" => "Square", // &imgratio=3
				"4" => "Horizontal", // &imgratio=4
				"5" => "Wide horizontal" // &imgratio=5
			];
			
			$format = [
				"any" => "Any format",
				"3" => "JPG", // &imgformat=3
				"5" => "JPEG", // &imgformat=5
				"4" => "PNG", // &imgformat=4
				"2" => "BMP", // &imgformat=2
				"6" => "GIF (Animated)" // &imgformat=6
			];
			
			$color = [
				"any" => "Any color",
				"1024" => "White", // &ic=1024
				"2048" => "Black & White",
				"512" => "Black",
				"64" => "Magenta",
				"16" => "Blue",
				"1" => "Red",
				"2" => "Yellow",
				"32" => "Purple",
				"4" => "Green",
				"8" => "Teal",
				"256" => "Orange",
				"128" => "Brown"
			];
			
			$type = [
				"any" => "Any type",
				"hd" => "HD", // &hd=1
				"isImgSet" => "Photo album", // &isImgSet=1
				"copyright" => "Copyright" // &copyright=1
			];
			
			return [
				"sort" => ["display" => "Sort", "option" => $sort],
				"size" => ["display" => "Size", "option" => $size],
				"ratio" => ["display" => "Ratio", "option" => $ratio],
				"format" => ["display" => "Format", "option" => $format],
				"color" => ["display" => "Color", "option" => $color],
				"type" => ["display" => "Type", "option" => $type]
			];
		}
		
		if($page == "videos"){
			
			// no filters for video search
			return [];
		}
		
		if($page == "news"){
			
			$category = [
				"any" => "All news",
				"media" => "Media websites", // &medium=1
				"baijiahao" => "Baidu Baijiahao" // &medium=2
			];
			
			return [
				"category" => ["display" => "Category", "option" => $category]
			];
		}
		
		// unknown page type
		return null;
	}
 | 
						||
	
 | 
						||
	/**
	 * Perform a GET request and return the response body.
	 *
	 * Two header profiles are sent: a page-navigation profile when no referer
	 * is given, and an XHR-style profile (JSON Accept, Referer, cors/same-origin
	 * fetch metadata) when one is. Cookies received through Set-Cookie headers
	 * are appended to $this->cookie, which is sent back on later requests.
	 *
	 * @param mixed $proxy Proxy descriptor handed to backend::assign_proxy()
	 * @param string $url Target URL, without query string
	 * @param array $get Query parameters appended to $url
	 * @param string|false $referer Referer to send, or false for a navigation request
	 * @return string Response body
	 * @throws Exception with the curl error message when the transfer fails
	 */
	private function get($proxy, $url, $get = [], $referer = false){
		
		if($get !== []){
			
			$url .= "?" . http_build_query($get);
		}
		
		$curlproc = curl_init();
		
		// collect name => value pairs from every Set-Cookie response header
		$cookies_tmp = [];
		curl_setopt($curlproc, CURLOPT_HEADERFUNCTION, function($curlproc, $header) use (&$cookies_tmp){
			
			// curl expects the number of bytes consumed to be returned
			$length = strlen($header);
			
			$header = explode(":", $header, 2);
			
			if(
				isset($header[1]) &&
				trim(strtolower($header[0])) == "set-cookie"
			){
				
				$cookie_tmp = explode("=", trim($header[1]), 2);
				
				// skip malformed lines that carry no "name=value" pair
				if(isset($cookie_tmp[1])){
					
					// keep the value only, drop attributes (path, expires, ...)
					$cookies_tmp[trim($cookie_tmp[0])] =
						explode(";", $cookie_tmp[1], 2)[0];
				}
			}
			
			return $length;
		});
		
		curl_setopt($curlproc, CURLOPT_URL, $url);
		curl_setopt($curlproc, CURLOPT_ENCODING, ""); // default encoding
		
		//
		// Build the request headers. The order of the entries is kept
		// identical to what each profile sent before.
		//
		$is_xhr = $referer !== false;
		
		$headers = [
			"User-Agent: " . config::USER_AGENT,
			$is_xhr ?
				"Accept: application/json, text/plain, */*" :
				"Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
			"Accept-Language: en-US,en;q=0.5",
			"Accept-Encoding: gzip, deflate, br, zstd"
		];
		
		if($is_xhr){
			
			$headers[] = "Referer: {$referer}";
		}
		
		$headers[] = "DNT: 1";
		$headers[] = "Sec-GPC: 1";
		$headers[] = "Connection: keep-alive";
		
		if($this->cookie !== null){
			
			$headers[] = "Cookie: {$this->cookie}";
		}
		
		$headers[] = "Upgrade-Insecure-Requests: 1";
		
		if($is_xhr){
			
			$headers[] = "Sec-Fetch-Dest: empty";
			$headers[] = "Sec-Fetch-Mode: cors";
			$headers[] = "Sec-Fetch-Site: same-origin";
		}else{
			
			$headers[] = "Sec-Fetch-Dest: document";
			$headers[] = "Sec-Fetch-Mode: navigate";
			$headers[] = "Sec-Fetch-Site: cross-site";
			$headers[] = "Priority: u=0, i";
		}
		
		curl_setopt($curlproc, CURLOPT_HTTPHEADER, $headers);
		
		curl_setopt($curlproc, CURLOPT_RETURNTRANSFER, true);
		curl_setopt($curlproc, CURLOPT_SSL_VERIFYHOST, 2);
		curl_setopt($curlproc, CURLOPT_SSL_VERIFYPEER, true);
		curl_setopt($curlproc, CURLOPT_CONNECTTIMEOUT, 30);
		curl_setopt($curlproc, CURLOPT_TIMEOUT, 30);
		
		$this->backend->assign_proxy($curlproc, $proxy);
		
		$data = curl_exec($curlproc);
		
		if(curl_errno($curlproc)){
			
			// read the message before releasing the handle
			$error = curl_error($curlproc);
			curl_close($curlproc);
			
			throw new Exception($error);
		}
		
		//
		// Store cookies
		// cast first: $this->cookie is null until a cookie is known and
		// passing null to strlen()/rtrim() is deprecated since PHP 8.1
		//
		$cookie = (string)$this->cookie;
		
		if($cookie !== ""){
			
			$cookie .= "; ";
		}
		
		foreach($cookies_tmp as $cookie_name => $cookie_value){
			
			$cookie .= $cookie_name . "=" . $cookie_value . "; ";
		}
		
		$this->cookie = rtrim($cookie, " ;");
		
		curl_close($curlproc);
		return $data;
	}
 | 
						||
	
 | 
						||
	/**
	 * Queue a header-only request for a Baidu redirect link.
	 *
	 * URLs that are not baidu.com/link? redirects are ignored. Otherwise a
	 * curl handle is prepared, attached to the multi handle in $this->proc and
	 * remembered in $this->handles under the current category / item / sublink
	 * position so resolve_urls() can read the response headers afterwards.
	 *
	 * @param mixed $proxy Proxy descriptor handed to backend::assign_proxy()
	 * @param string $url URL to inspect
	 */
	private function redirect_add_url($proxy, $url){
		
		$is_redirect =
			preg_match(
				'/^https?:\/\/(?:www\.)?baidu\.com\/link\?/',
				$url
			);
		
		if($is_redirect === 0){
			
			// not a baidu redirect
			return;
		}
		
		$request_headers = [
			"User-Agent: " . config::USER_AGENT,
			"Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
			"Accept-Language: en-US,en;q=0.5",
			"Accept-Encoding: gzip, deflate, br, zstd",
			"DNT: 1",
			"Sec-GPC: 1",
			"Connection: keep-alive",
			"Upgrade-Insecure-Requests: 1",
			"Sec-Fetch-Dest: document",
			"Sec-Fetch-Mode: navigate",
			"Sec-Fetch-Site: none",
			"Sec-Fetch-User: ?1",
			"Priority: u=0, i"
		];
		
		$handle = curl_init();
		
		curl_setopt($handle, CURLOPT_URL, $url);
		curl_setopt($handle, CURLOPT_ENCODING, ""); // default encoding
		curl_setopt($handle, CURLOPT_HTTPHEADER, $request_headers);
		
		// only the response headers are needed, skip the body
		curl_setopt($handle, CURLOPT_HEADER, true);
		curl_setopt($handle, CURLOPT_NOBODY, true);
		curl_setopt($handle, CURLOPT_RETURNTRANSFER, true);
		
		curl_setopt($handle, CURLOPT_SSL_VERIFYHOST, 2);
		curl_setopt($handle, CURLOPT_SSL_VERIFYPEER, true);
		
		curl_setopt($handle, CURLOPT_CONNECTTIMEOUT, 30);
		curl_setopt($handle, CURLOPT_TIMEOUT, 30);
		
		$this->backend->assign_proxy($handle, $proxy);
		
		curl_multi_add_handle($this->proc, $handle);
		
		$category = $this->handle_category;
		$item = $this->handle_increment;
		$sublink = $this->sublink_increment;
		
		$this->handles[$category][$item][$sublink] = $handle;
	}
 | 
						||
	
 | 
						||
	/**
	 * Replace Baidu redirect links in a result set with their real targets.
	 *
	 * Every item URL (and every sublink URL) of the listed categories is
	 * passed to redirect_add_url(); the queued requests run in parallel and
	 * the Location header of each response overwrites the matching URL.
	 * URLs without a Location header are left untouched.
	 *
	 * @param mixed $proxy Proxy descriptor used for the requests
	 * @param array $collection Result set, modified in place. Items are
	 *                          addressed by their iteration position, so each
	 *                          category (and sublink list) is expected to be
	 *                          a 0-indexed list.
	 * @param array $categories Keys of $collection to process
	 */
	private function resolve_urls($proxy, &$collection, $categories){
		
		// forget handles from an earlier call, otherwise their responses
		// would be read again and written over the new collection
		$this->handles = [];
		
		$this->proc = curl_multi_init();
		
		foreach($categories as $category){
			
			$this->handle_category = $category;
			$this->handle_increment = 0;
			
			foreach($collection[$category] as $item){
				
				// index 0 is the item itself, sublinks start at 1
				$this->sublink_increment = 0;
				$this->redirect_add_url($proxy, $item["url"]);
				
				if(isset($item["sublink"])){
					
					foreach($item["sublink"] as $sublink){
						
						$this->sublink_increment++;
						$this->redirect_add_url($proxy, $sublink["url"]);
					}
				}
				
				$this->handle_increment++;
			}
		}
		
		do{
			$status = curl_multi_exec($this->proc, $active);
			
			if(
				$active &&
				$status == CURLM_OK
			){
				
				// sleep until there is activity instead of spinning the CPU
				curl_multi_select($this->proc);
			}
			
		}while($active && $status == CURLM_OK);
		
		//
		// if we reach this, we're done downloading garbage
		//
		
		foreach($this->handles as $category => $items){
			
			foreach($items as $index => $handles){
				
				foreach($handles as $sublinkindex => $handle){
					
					preg_match(
						'/location: ?(.*)$/im',
						curl_multi_getcontent($handle),
						$location
					);
					
					if(isset($location[1])){
						
						if($sublinkindex === 0){
							
							$collection[$category][$index]["url"] = trim($location[1]);
						}else{
							
							// sublink handles are stored with an offset of 1
							$collection[$category][$index]["sublink"][$sublinkindex - 1]["url"] = trim($location[1]);
						}
					}
					
					curl_multi_remove_handle($this->proc, $handle);
					curl_close($handle);
				}
			}
		}
		
		curl_multi_close($this->proc);
		
		// nothing left to track
		$this->handles = [];
	}
 | 
						||
	
 | 
						||
	/**
	 * Swap scraped carousel images for the full set from Baidu's image viewer.
	 *
	 * Fetches the viewer page of the second image (for some reason, requesting
	 * the second image of the set does not trigger the captcha) and reads the
	 * JSON stored in its "image-detail-data" script element. When that yields
	 * at least one usable image, $data["image"] is replaced; in every other
	 * case the previously scraped images are kept.
	 *
	 * @param mixed $proxy Proxy descriptor used for the request
	 * @param array $data Result set, modified in place
	 */
	private function resolve_images($proxy, &$data){
		
		if(
			!isset($data["image"][1]["url"]) ||
			preg_match(
				'/^https:\/\/image\.baidu\.com\/search\/detail/',
				$data["image"][1]["url"]
			) === 0
		){
			
			// we have an already resolved image link, do nothing
			return;
		}
		
		try{
			
			$html =
				$this->get(
					$proxy,
					$data["image"][1]["url"],
					[]
				);
			
		}catch(Exception $error){
			
			// fallback to the limited dataset we have
			return;
		}
		
		$this->fuckhtml->load($html);
		
		$script =
			$this->fuckhtml
			->getElementById(
				"image-detail-data",
				"script"
			);
		
		if(!$script){
			
			// viewer page did not contain the data, keep what we have
			return;
		}
		
		$json =
			json_decode(
				$script["innerHTML"],
				true
			);
		
		if(
			!isset($json["data"]["images"]) ||
			!is_array($json["data"]["images"]) ||
			count($json["data"]["images"]) === 0
		){
			
			// do nothing
			return;
		}
		
		//
		// Collect into a temporary list first, so the scraped images
		// are only discarded once we know we have replacements
		// the imageset !!should!! be the same
		//
		$resolved = [];
		
		foreach($json["data"]["images"] as $image){
			
			if(
				!is_array($image) ||
				!isset($image["objurl"]) ||
				!isset($image["thumburl"])
			){
				
				// no usable image in this entry
				continue;
			}
			
			// thumbnail dimensions are read from the "w" and "h"
			// query parameters of the thumbnail URL
			$thumb_size = [];
			$thumb_query = parse_url($image["thumburl"], PHP_URL_QUERY);
			
			if(is_string($thumb_query)){
				
				parse_str($thumb_query, $thumb_size);
			}
			
			$resolved[] = [
				"title" =>
					isset($image["titleShow"]) ?
					$this->fuckhtml
					->getTextContent(
						$image["titleShow"]
					) :
					"image", // same placeholder parse_search() uses for untitled images
				"source" => [
					[
						"url" => $image["objurl"],
						"width" => (int)($image["width"] ?? 0),
						"height" => (int)($image["height"] ?? 0)
					],
					[ // thumbnail
						"url" => $image["thumburl"],
						"width" => (int)($thumb_size["w"] ?? 0),
						"height" => (int)($thumb_size["h"] ?? 0)
					]
				],
				"url" => $image["fromUrl"] ?? null
			];
		}
		
		if(count($resolved) !== 0){
			
			$data["image"] = $resolved;
		}
	}
 | 
						||
	
 | 
						||
	/**
	 * Run a Baidu web search, or continue one from a next-page token.
	 *
	 * @param array $get Request parameters. Keys read here:
	 *                   "npt"   next-page token, falsy for a new search
	 *                   "s"     search query
	 *                   "newer" lower date bound, false when unset
	 *                   "older" upper date bound, false when unset
	 * @return mixed Whatever parse_search() returns for the fetched page
	 * @throws Exception "Failed to fetch search page" when the request fails
	 */
	public function web($get){
		
		if($get["npt"]){
			
			//
			// Continue from a stored request
			//
			[$json, $proxy] = $this->backend->get($get["npt"], "web");
			
			$json = json_decode($json, true);
			$this->cookie = $json["cookie"];
			$npt_data = $json["req"];
			
			// move on to the next batch of 20 results
			$npt_data["pn"] = $npt_data["pn"] + 20;
			
			$request = $npt_data;
			
		}else{
			
			//
			// Get authentication token
			//
			$proxy = $this->backend->get_ip();
			
			// running this will give us shit in $this->cookie
			// @TODO probably not needed? I get blocked anyways ffs
			//$this->get($proxy, "https://www.baidu.com", []);
			
			$npt_data = [
				"wd" => $get["s"],
				"rn" => 20
			];
			
			// &gpc=stf%3D0%2C1752638400|stftype%3D2
			if(
				$get["older"] !== false ||
				$get["newer"] !== false
			){
				
				// NOTE(review): assumes stf is "start,end" in unix time, which
				// is not visible in this file; confirm against live results.
				// "newer than" is the start of the range, "older than" its end.
				// An unset bound falls back to 0 / the current time, so false
				// is never interpolated into the parameter as an empty string.
				$from =
					$get["newer"] === false ?
					0 : $get["newer"];
				
				$to =
					$get["older"] === false ?
					time() : $get["older"];
				
				$npt_data["gpc"] = "stf={$from},{$to}|stftype=2";
			}
			
			// the first page is requested without an offset,
			// "pn" is only recorded for the next-page token
			$request = $npt_data;
			$npt_data["pn"] = 0;
		}
		
		try{
			
			$html = $this->get(
				$proxy,
				"https://www.baidu.com/s",
				$request
			);
			
		}catch(Exception $error){
			
			// keep the underlying curl error reachable through getPrevious()
			throw new Exception("Failed to fetch search page", 0, $error);
		}
		
		return $this->parse_search($proxy, "web", $npt_data, $html);
	}
 | 
						||
	
 | 
						||
	private function parse_search($proxy, $pagetype, $npt_data, $html){
 | 
						||
		
 | 
						||
		// @HACK
 | 
						||
		// remove newlines from the html, cause it fucks with fuckhtml
 | 
						||
		$html = str_replace(["\n", "\r"], "", $html);
 | 
						||
		
 | 
						||
		$out = [
 | 
						||
			"status" => "ok",
 | 
						||
			"spelling" => [
 | 
						||
				"type" => "no_correction",
 | 
						||
				"using" => null,
 | 
						||
				"correction" => null
 | 
						||
			],
 | 
						||
			"npt" => null,
 | 
						||
			"answer" => [],
 | 
						||
			"web" => [],
 | 
						||
			"image" => [],
 | 
						||
			"video" => [],
 | 
						||
			"news" => [],
 | 
						||
			"related" => []
 | 
						||
		];
 | 
						||
		
 | 
						||
		$this->fuckhtml->load($html);
 | 
						||
		
 | 
						||
		$this->detect_ass();
 | 
						||
		
 | 
						||
		$datafields =
 | 
						||
			$this->fuckhtml
 | 
						||
			->getElementsByAttributeName(
 | 
						||
				"id",
 | 
						||
				"div"
 | 
						||
			);
 | 
						||
		
 | 
						||
		//
 | 
						||
		// Get next page
 | 
						||
		//
 | 
						||
		$npt =
 | 
						||
			$this->fuckhtml
 | 
						||
			->getElementsByClassName(
 | 
						||
				"n",
 | 
						||
				"a"
 | 
						||
			);
 | 
						||
		
 | 
						||
		if(count($npt) !== 0){
 | 
						||
			
 | 
						||
			$out["npt"] =
 | 
						||
				$this->backend->store(
 | 
						||
					json_encode([
 | 
						||
						"req" => $npt_data,
 | 
						||
						"cookie" => $this->cookie
 | 
						||
					]),
 | 
						||
					$pagetype,
 | 
						||
					$proxy
 | 
						||
				);
 | 
						||
		}
 | 
						||
		
 | 
						||
		//
 | 
						||
		// Get related searches
 | 
						||
		//
 | 
						||
		$related_container =
 | 
						||
			$this->fuckhtml
 | 
						||
			->getElementById(
 | 
						||
				"rs_new",
 | 
						||
				$datafields
 | 
						||
			);
 | 
						||
		
 | 
						||
		if($related_container){
 | 
						||
			
 | 
						||
			$this->fuckhtml->load($related_container);
 | 
						||
			
 | 
						||
			$as =
 | 
						||
				$this->fuckhtml
 | 
						||
				->getElementsByClassName(
 | 
						||
					"c-color-link",
 | 
						||
					"a"
 | 
						||
				);
 | 
						||
			
 | 
						||
			foreach($as as $a){
 | 
						||
				
 | 
						||
				$text =
 | 
						||
					explode(
 | 
						||
						">",
 | 
						||
						$this->fuckhtml
 | 
						||
						->getTextContent(
 | 
						||
							$a
 | 
						||
						),
 | 
						||
						2
 | 
						||
					);
 | 
						||
				
 | 
						||
				$out["related"][] = $text[count($text) - 1];
 | 
						||
			}
 | 
						||
		}
 | 
						||
		
 | 
						||
		foreach($datafields as $datafield){
 | 
						||
			
 | 
						||
			if(
 | 
						||
				!isset($datafield["attributes"]["id"]) ||
 | 
						||
				preg_match(
 | 
						||
					'/^[0-9]+$/',
 | 
						||
					$datafield["attributes"]["id"]
 | 
						||
				) === 0
 | 
						||
			){
 | 
						||
				
 | 
						||
				// not a search result
 | 
						||
				continue;
 | 
						||
			}
 | 
						||
			
 | 
						||
			$this->fuckhtml->load($datafield);
 | 
						||
			$div =
 | 
						||
				$this->fuckhtml
 | 
						||
				->getElementsByTagName(
 | 
						||
					"div"
 | 
						||
				);
 | 
						||
			
 | 
						||
			//
 | 
						||
			// Don't parse as a search result if it's a card
 | 
						||
			//
 | 
						||
			$card =
 | 
						||
				$this->fuckhtml
 | 
						||
				->getElementsByClassName(
 | 
						||
					"cosc-card",
 | 
						||
					$div
 | 
						||
				);
 | 
						||
			
 | 
						||
			if(count($card) !== 0){
 | 
						||
				
 | 
						||
				//
 | 
						||
				// Parse chinese youtube shorts
 | 
						||
				//
 | 
						||
				$ytshorts_probe =
 | 
						||
					$this->fuckhtml
 | 
						||
					->getElementsByClassName(
 | 
						||
						"tts-b-item",
 | 
						||
						$div
 | 
						||
					);
 | 
						||
				
 | 
						||
				if(count($ytshorts_probe) !== 0){
 | 
						||
					
 | 
						||
					$videos =
 | 
						||
						$this->fuckhtml
 | 
						||
						->getElementsByAttributeValue(
 | 
						||
							"data-show",
 | 
						||
							"list",
 | 
						||
							$div
 | 
						||
						);
 | 
						||
					
 | 
						||
					foreach($videos as $video){
 | 
						||
						
 | 
						||
						$this->fuckhtml->load($video);
 | 
						||
						
 | 
						||
						$title =
 | 
						||
							$this->fuckhtml
 | 
						||
							->getElementsByClassName(
 | 
						||
								"cosc-title-slot",
 | 
						||
								"span"
 | 
						||
							);
 | 
						||
						
 | 
						||
						if(count($title) === 0){
 | 
						||
							
 | 
						||
							continue;
 | 
						||
						}
 | 
						||
						
 | 
						||
						$url =
 | 
						||
							$this->fuckhtml
 | 
						||
							->getElementsByTagName(
 | 
						||
								"a"
 | 
						||
							);
 | 
						||
						
 | 
						||
						if(count($url) === 0){
 | 
						||
							
 | 
						||
							continue;
 | 
						||
						}
 | 
						||
						
 | 
						||
						$image =
 | 
						||
							$this->fuckhtml
 | 
						||
							->getElementsByClassName(
 | 
						||
								"cos-image-body",
 | 
						||
								"img"
 | 
						||
							);
 | 
						||
						
 | 
						||
						if(count($image) === 0){
 | 
						||
							
 | 
						||
							$image = [
 | 
						||
								"ratio" => null,
 | 
						||
								"url" => null
 | 
						||
							];
 | 
						||
						}else{
 | 
						||
							
 | 
						||
							$image = [
 | 
						||
								"ratio" => "1:1",
 | 
						||
								"url" =>
 | 
						||
									$this->fuckhtml
 | 
						||
									->getTextContent(
 | 
						||
										$image[0]["attributes"]["src"]
 | 
						||
									)
 | 
						||
							];
 | 
						||
						}
 | 
						||
						
 | 
						||
						// get duration
 | 
						||
						$divs =
 | 
						||
							$this->fuckhtml
 | 
						||
							->getElementsByAttributeName(
 | 
						||
								"class",
 | 
						||
								"div"
 | 
						||
							);
 | 
						||
						
 | 
						||
						$duration = null;
 | 
						||
						foreach($divs as $probe){
 | 
						||
							
 | 
						||
							if(strpos($probe["attributes"]["class"], "tag-bottom-right") !== false){
 | 
						||
								
 | 
						||
								$duration =
 | 
						||
									$this->hms2int(
 | 
						||
										$this->fuckhtml
 | 
						||
										->getTextContent(
 | 
						||
											$probe
 | 
						||
										)
 | 
						||
									);
 | 
						||
								break;
 | 
						||
							}
 | 
						||
						}
 | 
						||
						
 | 
						||
						$out["video"][] = [
 | 
						||
							"title" =>
 | 
						||
								$this->fuckhtml
 | 
						||
								->getTextContent(
 | 
						||
									$title[0]
 | 
						||
								),
 | 
						||
							"description" => null,
 | 
						||
							"date" => null,
 | 
						||
							"duration" => $duration,
 | 
						||
							"views" => null,
 | 
						||
							"thumb" => $image,
 | 
						||
							"url" =>
 | 
						||
								$this->fuckhtml
 | 
						||
								->getTextContent(
 | 
						||
									$url[0]["attributes"]["href"]
 | 
						||
								)
 | 
						||
						];
 | 
						||
					}
 | 
						||
				}
 | 
						||
				
 | 
						||
				//
 | 
						||
				// Parse image carousel
 | 
						||
				//
 | 
						||
				$is_image_carousel = false;
 | 
						||
				foreach($div as $d){
 | 
						||
					
 | 
						||
					if(
 | 
						||
						isset($d["attributes"]["class"]) &&
 | 
						||
						strpos($d["attributes"]["class"], "image-container") !== false
 | 
						||
					){
 | 
						||
						
 | 
						||
						$is_image_carousel = true;
 | 
						||
						break;
 | 
						||
					}
 | 
						||
				}
 | 
						||
				
 | 
						||
				if($is_image_carousel){
 | 
						||
					
 | 
						||
					preg_match(
 | 
						||
						'/<!--s-data:([\S\s]*)-->/U',
 | 
						||
						$datafield["innerHTML"],
 | 
						||
						$matches
 | 
						||
					);
 | 
						||
					
 | 
						||
					if(isset($matches[1])){
 | 
						||
						
 | 
						||
						// weird behavior with the smaller image carousel where --cos* CSS variables are escaped wrong
 | 
						||
						$json =
 | 
						||
							$this->fuckhtml
 | 
						||
							->parseJsObject(
 | 
						||
								str_replace(
 | 
						||
									"-\-",
 | 
						||
									"--",
 | 
						||
									$matches[1]
 | 
						||
								)
 | 
						||
							);
 | 
						||
						
 | 
						||
						if(
 | 
						||
							$json !== null &&
 | 
						||
							isset($json["imageList"][0]["images"])
 | 
						||
						){
 | 
						||
							
 | 
						||
							// parse image carousel
 | 
						||
							foreach($json["imageList"][0]["images"] as $image){
 | 
						||
								
 | 
						||
								parse_str(parse_url($image["thumburl"], PHP_URL_QUERY), $thumb_size);
 | 
						||
								
 | 
						||
								$out["image"][] = [
 | 
						||
									"title" => "image",
 | 
						||
									"source" => [
 | 
						||
										[
 | 
						||
											"url" => $image["objurl"],
 | 
						||
											"width" => (int)$image["width"],
 | 
						||
											"height" => (int)$image["height"]
 | 
						||
										],
 | 
						||
										[ // thumbnail
 | 
						||
											"url" => $image["thumburl"],
 | 
						||
											"width" => (int)$thumb_size["w"],
 | 
						||
											"height" => (int)$thumb_size["h"]
 | 
						||
										]
 | 
						||
									],
 | 
						||
									"url" => $image["jumpUrl"]
 | 
						||
								];
 | 
						||
							}
 | 
						||
						}
 | 
						||
					}
 | 
						||
				}
 | 
						||
				continue;
 | 
						||
			}
 | 
						||
			
 | 
						||
			if(!isset($datafield["attributes"]["mu"])){
 | 
						||
				
 | 
						||
				// dont scrape if we dont have the direct link
 | 
						||
				continue;
 | 
						||
			}
 | 
						||
			
 | 
						||
			// class:FYB_RD -> News garbage, IGNORE
 | 
						||
			
 | 
						||
			$result =
 | 
						||
				$this->fuckhtml
 | 
						||
				->getElementsByClassName(
 | 
						||
					"result",
 | 
						||
					[$datafield]
 | 
						||
				);
 | 
						||
			
 | 
						||
			if(count($result) !== 0){
 | 
						||
				
 | 
						||
				//
 | 
						||
				// Parse normal search result
 | 
						||
				//
 | 
						||
				
 | 
						||
				$title =
 | 
						||
					$this->fuckhtml
 | 
						||
					->getElementsByClassName(
 | 
						||
						"sc-link",
 | 
						||
						"a"
 | 
						||
					);
 | 
						||
				
 | 
						||
				if(count($title) === 0){
 | 
						||
					
 | 
						||
					// should not happen
 | 
						||
					continue;
 | 
						||
				}
 | 
						||
				
 | 
						||
				$title =
 | 
						||
					$this->titledots(
 | 
						||
						$this->fuckhtml
 | 
						||
						->getTextContent(
 | 
						||
							$title[0]
 | 
						||
						)
 | 
						||
					);
 | 
						||
				
 | 
						||
				$description =
 | 
						||
					$this->fuckhtml
 | 
						||
					->getElementsByClassName(
 | 
						||
						"c-color",
 | 
						||
						$div
 | 
						||
					);
 | 
						||
				
 | 
						||
				if(count($description) !== 0){
 | 
						||
					
 | 
						||
					$this->fuckhtml->load($description[0]);
 | 
						||
					
 | 
						||
					$description =
 | 
						||
						$this->fuckhtml
 | 
						||
						->getElementsByAttributeName(
 | 
						||
							"class",
 | 
						||
							"span"
 | 
						||
						);
 | 
						||
					
 | 
						||
					$found_desc = false;
 | 
						||
					foreach($description as $desc){
 | 
						||
						
 | 
						||
						if(stripos($desc["attributes"]["class"], "summary-text") !== false){
 | 
						||
							
 | 
						||
							$found_desc = true;
 | 
						||
							$description =
 | 
						||
								$this->titledots(
 | 
						||
									$this->fuckhtml
 | 
						||
									->getTextContent(
 | 
						||
										$desc
 | 
						||
									)
 | 
						||
								);
 | 
						||
							break;
 | 
						||
						}
 | 
						||
					}
 | 
						||
					
 | 
						||
					if($found_desc === false){
 | 
						||
						
 | 
						||
						$description = null;
 | 
						||
					}
 | 
						||
					
 | 
						||
					$this->fuckhtml->load($datafield);
 | 
						||
				}else{
 | 
						||
					
 | 
						||
					$description = null;
 | 
						||
				}
 | 
						||
				
 | 
						||
				// parse date
 | 
						||
				$date_probe =
 | 
						||
					$this->fuckhtml
 | 
						||
					->getElementsByClassName(
 | 
						||
						"cos-color-text-minor",
 | 
						||
						"span"
 | 
						||
					);
 | 
						||
				
 | 
						||
				if(count($date_probe) !== 0){
 | 
						||
					
 | 
						||
					$date =
 | 
						||
						$this->parse_time(
 | 
						||
							$this->fuckhtml
 | 
						||
							->getTextContent(
 | 
						||
								$date_probe[0]
 | 
						||
							)
 | 
						||
						);
 | 
						||
				}else{
 | 
						||
					
 | 
						||
					$date = null;
 | 
						||
				}
 | 
						||
				
 | 
						||
				// parse image
 | 
						||
				$img =
 | 
						||
					$this->fuckhtml
 | 
						||
					->getElementsByTagName(
 | 
						||
						"img"
 | 
						||
					);
 | 
						||
				
 | 
						||
				if(count($img) !== 0){
 | 
						||
					
 | 
						||
					$image = [
 | 
						||
						"ratio" => "16:9",
 | 
						||
						"url" =>
 | 
						||
							$this->unfuckthumb(
 | 
						||
								$this->fuckhtml
 | 
						||
								->getTextContent(
 | 
						||
									$img[0]["attributes"]["src"]
 | 
						||
								)
 | 
						||
							)
 | 
						||
					];
 | 
						||
				}else{
 | 
						||
					
 | 
						||
					$image = [
 | 
						||
						"ratio" => null,
 | 
						||
						"url" => null
 | 
						||
					];
 | 
						||
				}
 | 
						||
				
 | 
						||
				// get page type
 | 
						||
				$pagetype_probe =
 | 
						||
					$this->fuckhtml
 | 
						||
					->getElementsByTagName(
 | 
						||
						"b"
 | 
						||
					);
 | 
						||
				
 | 
						||
				$pagetype = "web";
 | 
						||
				foreach($pagetype_probe as $probe){
 | 
						||
					
 | 
						||
					$pagetype =
 | 
						||
						strtolower(
 | 
						||
							trim(
 | 
						||
								$this->fuckhtml
 | 
						||
								->getTextContent(
 | 
						||
									$probe
 | 
						||
								),
 | 
						||
								" 【】"
 | 
						||
							)
 | 
						||
						);
 | 
						||
				}
 | 
						||
				
 | 
						||
				// get extra links
 | 
						||
				$sublinks = [];
 | 
						||
				
 | 
						||
				foreach($div as $d){
 | 
						||
					
 | 
						||
					if(
 | 
						||
						isset($d["attributes"]["class"]) &&
 | 
						||
						strpos($d["attributes"]["class"], "exta-link") !== false
 | 
						||
					){
 | 
						||
						
 | 
						||
						$this->fuckhtml->load($d);
 | 
						||
						
 | 
						||
						$links =
 | 
						||
							$this->fuckhtml
 | 
						||
							->getElementsByClassName(
 | 
						||
								"cos-space-mt-xs",
 | 
						||
								"div"
 | 
						||
							);
 | 
						||
						
 | 
						||
						foreach($links as $link){
 | 
						||
							
 | 
						||
							$this->fuckhtml->load($link);
 | 
						||
							$s_title =
 | 
						||
								$this->fuckhtml
 | 
						||
								->getElementsByTagName(
 | 
						||
									"h3"
 | 
						||
								);
 | 
						||
							
 | 
						||
							if(count($s_title) === 0){
 | 
						||
								
 | 
						||
								// should not happen
 | 
						||
								continue;
 | 
						||
							}
 | 
						||
							
 | 
						||
							$data2 =
 | 
						||
								json_decode(
 | 
						||
									$this->fuckhtml
 | 
						||
									->getTextContent(
 | 
						||
										$s_title[0]["attributes"]["data-click"]
 | 
						||
									),
 | 
						||
									true
 | 
						||
								);
 | 
						||
							
 | 
						||
							if(!isset($data2["clk_info"])){
 | 
						||
								
 | 
						||
								// wtf
 | 
						||
								continue;
 | 
						||
							}
 | 
						||
							
 | 
						||
							$data2 =
 | 
						||
								json_decode(
 | 
						||
									$data2["clk_info"],
 | 
						||
									true
 | 
						||
								);
 | 
						||
							
 | 
						||
							if(!isset($data2["url"])){
 | 
						||
								
 | 
						||
								// no link, fuck off
 | 
						||
								continue;
 | 
						||
							}
 | 
						||
							
 | 
						||
							$url =
 | 
						||
								rawurldecode(
 | 
						||
									$data2["url"]
 | 
						||
								);
 | 
						||
							
 | 
						||
							$data =
 | 
						||
								$this->fuckhtml
 | 
						||
								->getElementsByTagName(
 | 
						||
									"p"
 | 
						||
								);
 | 
						||
							
 | 
						||
							$s_description = null;
 | 
						||
							
 | 
						||
							if(count($data) !== 0){
 | 
						||
								
 | 
						||
								$data =
 | 
						||
									json_decode(
 | 
						||
										$this->fuckhtml
 | 
						||
										->getTextContent(
 | 
						||
											$data[0]["attributes"]["sub-show-log"]
 | 
						||
										),
 | 
						||
										true
 | 
						||
									);
 | 
						||
								
 | 
						||
								if(isset($data["ext"]["content"])){
 | 
						||
									
 | 
						||
									$s_description = $data["ext"]["content"];
 | 
						||
								}
 | 
						||
							}
 | 
						||
							
 | 
						||
							$sublinks[] = [
 | 
						||
								"title" =>
 | 
						||
									$this->fuckhtml
 | 
						||
									->getTextContent(
 | 
						||
										$s_title[0]
 | 
						||
									),
 | 
						||
								"description" => $s_description,
 | 
						||
								"url" => $url,
 | 
						||
								"date" => null
 | 
						||
							];
 | 
						||
						}
 | 
						||
						break;
 | 
						||
					}
 | 
						||
				}
 | 
						||
				
 | 
						||
				$out["web"][] = [
 | 
						||
					"title" => $title,
 | 
						||
					"description" => $description,
 | 
						||
					"url" =>
 | 
						||
						$this->fuckhtml
 | 
						||
						->getTextContent(
 | 
						||
							$datafield["attributes"]["mu"]
 | 
						||
						),
 | 
						||
					"date" => $date,
 | 
						||
					"type" => $pagetype,
 | 
						||
					"thumb" => $image,
 | 
						||
					"sublink" => $sublinks,
 | 
						||
					"table" => []
 | 
						||
				];
 | 
						||
				
 | 
						||
				continue;
 | 
						||
			}
 | 
						||
			
 | 
						||
			// parse special result
 | 
						||
			$result =
 | 
						||
				$this->fuckhtml
 | 
						||
				->getElementsByClassName(
 | 
						||
					"result-op",
 | 
						||
					[$datafield]
 | 
						||
				);
 | 
						||
			
 | 
						||
			if(count($result) !== 0){
 | 
						||
				
 | 
						||
				//
 | 
						||
				// Parse video carousel
 | 
						||
				//
 | 
						||
				if(
 | 
						||
					isset($datafield["attributes"]["tpl"]) &&
 | 
						||
					stripos($datafield["attributes"]["tpl"], "video") !== false
 | 
						||
				){
 | 
						||
					
 | 
						||
					preg_match(
 | 
						||
						'/<!--s-data:([\S\s]*)-->/U',
 | 
						||
						$datafield["innerHTML"],
 | 
						||
						$matches
 | 
						||
					);
 | 
						||
					
 | 
						||
					if(isset($matches[1])){
 | 
						||
					
 | 
						||
						$json =
 | 
						||
							json_decode(
 | 
						||
								$matches[1],
 | 
						||
								true
 | 
						||
							);
 | 
						||
						
 | 
						||
						if($json !== null){
 | 
						||
							
 | 
						||
							foreach($json["videoList"] as $video){
 | 
						||
								
 | 
						||
								$out["video"][] = [
 | 
						||
									"title" => $video["title"],
 | 
						||
									"description" =>
 | 
						||
										$this->titledots(
 | 
						||
											$video["desc"]
 | 
						||
										),
 | 
						||
									"date" =>
 | 
						||
										$this->parse_time(
 | 
						||
											$video["pubTime"]
 | 
						||
										),
 | 
						||
									"duration" =>
 | 
						||
										$this->hms2int(
 | 
						||
											$video["duration"]
 | 
						||
										),
 | 
						||
									"views" =>
 | 
						||
										$this->parse_viewcount(
 | 
						||
											$video["playCount"]
 | 
						||
										),
 | 
						||
									"thumb" => [
 | 
						||
										"ratio" => "16:9",
 | 
						||
										"url" => $video["poster"]
 | 
						||
									],
 | 
						||
									"url" => $video["bindProps"]["link"]
 | 
						||
								];
 | 
						||
							}
 | 
						||
						}
 | 
						||
					}
 | 
						||
					continue;
 | 
						||
				}
 | 
						||
				
 | 
						||
				//
 | 
						||
				// Special result div (wiki entries, rich divs)
 | 
						||
				//
 | 
						||
				$title =
 | 
						||
					$this->fuckhtml
 | 
						||
					->getElementsByTagName(
 | 
						||
						"h3"
 | 
						||
					);
 | 
						||
				
 | 
						||
				if(count($title) === 0){
 | 
						||
					
 | 
						||
					// should have a title somewhere
 | 
						||
					continue;
 | 
						||
				}
 | 
						||
				
 | 
						||
				$title =
 | 
						||
					explode(
 | 
						||
						">",
 | 
						||
						$this->fuckhtml
 | 
						||
						->getTextContent(
 | 
						||
							$title[0]
 | 
						||
						),
 | 
						||
						2
 | 
						||
					);
 | 
						||
				
 | 
						||
				if(count($title) === 2){
 | 
						||
					
 | 
						||
					$title = $title[1];
 | 
						||
				}else{
 | 
						||
					
 | 
						||
					$title = $title[0];
 | 
						||
				}
 | 
						||
				
 | 
						||
				// probe for wiki-like entry
 | 
						||
				$description =
 | 
						||
					$this->fuckhtml
 | 
						||
					->getElementsByClassName(
 | 
						||
						"sc-paragraph",
 | 
						||
						"p"
 | 
						||
					);
 | 
						||
				
 | 
						||
				if(count($description) === 0){
 | 
						||
					
 | 
						||
					// try and get grey description
 | 
						||
					$description =
 | 
						||
						$this->fuckhtml
 | 
						||
						->getElementsByClassName(
 | 
						||
							"c-color-gray2",
 | 
						||
							"p"
 | 
						||
						);
 | 
						||
					
 | 
						||
					if(count($description) === 0){
 | 
						||
						
 | 
						||
						// probe for special social media description
 | 
						||
						$description =
 | 
						||
							$this->fuckhtml
 | 
						||
							->getElementsByClassName(
 | 
						||
								"c-color-text",
 | 
						||
								"div"
 | 
						||
							);
 | 
						||
						
 | 
						||
						if(isset($description[0]["attributes"]["aria-label"])){
 | 
						||
							
 | 
						||
							$description =
 | 
						||
								$this->fuckhtml
 | 
						||
								->getTextContent(
 | 
						||
									$description[0]
 | 
						||
									["attributes"]
 | 
						||
									["aria-label"]
 | 
						||
								);
 | 
						||
						}else{
 | 
						||
							
 | 
						||
							// check for news tab description
 | 
						||
							$span =
 | 
						||
								$this->fuckhtml
 | 
						||
								->getElementsByClassName(
 | 
						||
									"c-font-normal",
 | 
						||
									"span"
 | 
						||
								);
 | 
						||
							
 | 
						||
							$description = null;
 | 
						||
							
 | 
						||
							foreach($span as $s){
 | 
						||
								
 | 
						||
								if(isset($s["attributes"]["aria-label"])){
 | 
						||
									
 | 
						||
									$description =
 | 
						||
										$this->titledots(
 | 
						||
											$this->fuckhtml
 | 
						||
											->getTextContent(
 | 
						||
												$span[count($span) - 1]
 | 
						||
											)
 | 
						||
										);
 | 
						||
									
 | 
						||
									break;
 | 
						||
								}
 | 
						||
							}
 | 
						||
						}
 | 
						||
					}else{
 | 
						||
						
 | 
						||
						$description =
 | 
						||
							$this->fuckhtml
 | 
						||
							->getTextContent(
 | 
						||
								$description[0]
 | 
						||
							);
 | 
						||
					}
 | 
						||
					
 | 
						||
				}else{
 | 
						||
					
 | 
						||
					preg_match(
 | 
						||
						'/<!--s-text-->([\S\s]*)<!--\/s-text-->/U',
 | 
						||
						$description[count($description) - 1]["innerHTML"],
 | 
						||
						$matches
 | 
						||
					);
 | 
						||
					
 | 
						||
					if(isset($matches[1])){
 | 
						||
						
 | 
						||
						$description =
 | 
						||
							$this->titledots(
 | 
						||
								$this->fuckhtml
 | 
						||
								->getTextContent(
 | 
						||
									$matches[1]
 | 
						||
								)
 | 
						||
							);
 | 
						||
					}else{
 | 
						||
						
 | 
						||
						$description = null;
 | 
						||
					}
 | 
						||
				}
 | 
						||
				
 | 
						||
				// get thumbnail
 | 
						||
				$thumb =
 | 
						||
					$this->fuckhtml
 | 
						||
					->getElementsByTagName(
 | 
						||
						"img"
 | 
						||
					);
 | 
						||
				
 | 
						||
				if(count($thumb) !== 0){
 | 
						||
					
 | 
						||
					$thumb = [
 | 
						||
						"ratio" => "1:1",
 | 
						||
						"url" =>
 | 
						||
							$this->unfuckthumb(
 | 
						||
								$this->fuckhtml
 | 
						||
								->getTextContent(
 | 
						||
									$thumb[0]["attributes"]["src"]
 | 
						||
								)
 | 
						||
							)
 | 
						||
					];
 | 
						||
				}else{
 | 
						||
					
 | 
						||
					$thumb = [
 | 
						||
						"ratio" => null,
 | 
						||
						"url" => null
 | 
						||
					];
 | 
						||
				}
 | 
						||
				
 | 
						||
				// get sublinks
 | 
						||
				preg_match(
 | 
						||
					'/<!--s-data:([\S\s]*)-->/U',
 | 
						||
					$datafield["innerHTML"],
 | 
						||
					$matches
 | 
						||
				);
 | 
						||
				
 | 
						||
				$sublinks = [];
 | 
						||
				
 | 
						||
				if(isset($matches[1])){
 | 
						||
					
 | 
						||
					$json =
 | 
						||
						json_decode(
 | 
						||
							$matches[1],
 | 
						||
							true
 | 
						||
						);
 | 
						||
					
 | 
						||
					if($json !== null){
 | 
						||
						
 | 
						||
						if(isset($json["buttons"])){
 | 
						||
							
 | 
						||
							foreach($json["buttons"] as $button){
 | 
						||
								
 | 
						||
								$sublinks[] = [
 | 
						||
									"title" => $button["text"],
 | 
						||
									"description" => null,
 | 
						||
									"date" => null,
 | 
						||
									"url" => $button["url"]
 | 
						||
								];
 | 
						||
							}
 | 
						||
						}elseif(isset($json["mthreadList"])){
 | 
						||
							
 | 
						||
							foreach($json["mthreadList"] as $thread){
 | 
						||
								
 | 
						||
								$sublinks[] = [
 | 
						||
									"title" =>
 | 
						||
										$this->fuckhtml
 | 
						||
										->getTextContent(
 | 
						||
											$thread["title"]
 | 
						||
										),
 | 
						||
									"description" => null,
 | 
						||
									"date" => null,
 | 
						||
									"url" => $thread["ttsInfo"]["titleUrl"]
 | 
						||
								];
 | 
						||
							}
 | 
						||
						}
 | 
						||
					}
 | 
						||
				}
 | 
						||
				
 | 
						||
				// get URL
 | 
						||
				// handle http://fakeurl.baidu.com bullshit
 | 
						||
				$url =
 | 
						||
					$this->fuckhtml
 | 
						||
					->getTextContent(
 | 
						||
						$datafield["attributes"]["mu"]
 | 
						||
					);
 | 
						||
				
 | 
						||
				if(
 | 
						||
					preg_match(
 | 
						||
						'/^https?:\/\/(?:fakeurl|nourl)(?:\.ubs)?\.baidu\.com/',
 | 
						||
						$url
 | 
						||
					)
 | 
						||
				){
 | 
						||
					
 | 
						||
					// we got some bullshit, get jumpUrl instead
 | 
						||
					$as =
 | 
						||
						$this->fuckhtml
 | 
						||
						->getElementsByTagName(
 | 
						||
							"a"
 | 
						||
						);
 | 
						||
					
 | 
						||
					if(count($as) !== 0){
 | 
						||
						
 | 
						||
						$url =
 | 
						||
							$this->fuckhtml
 | 
						||
							->getTextContent(
 | 
						||
								$as[0]["attributes"]["href"]
 | 
						||
							);
 | 
						||
					}
 | 
						||
				}
 | 
						||
				
 | 
						||
				// get xueshu sublinks
 | 
						||
				// get list
 | 
						||
				$xueshu_list =
 | 
						||
					$this->fuckhtml
 | 
						||
					->getElementsByClassName(
 | 
						||
						"op-xueshu-links-d20-list",
 | 
						||
						$div
 | 
						||
					);
 | 
						||
				
 | 
						||
				if(count($xueshu_list) !== 0){
 | 
						||
					
 | 
						||
					$this->fuckhtml->load($xueshu_list[0]);
 | 
						||
					
 | 
						||
					$rows =
 | 
						||
						$this->fuckhtml
 | 
						||
						->getElementsByClassName(
 | 
						||
							"c-row",
 | 
						||
							"div"
 | 
						||
						);
 | 
						||
					
 | 
						||
					// remove "read more" bullshit
 | 
						||
					foreach($rows as $row){
 | 
						||
						
 | 
						||
						if(strpos($row["attributes"]["class"], "op-xueshu-links-more") !== false){
 | 
						||
							
 | 
						||
							$xueshu_list[0]["innerHTML"] =
 | 
						||
								str_replace(
 | 
						||
									$row["outerHTML"],
 | 
						||
									"",
 | 
						||
									$xueshu_list[0]["innerHTML"]
 | 
						||
								);
 | 
						||
						}
 | 
						||
					}
 | 
						||
					
 | 
						||
					$this->fuckhtml->load($xueshu_list[0]);
 | 
						||
					
 | 
						||
					foreach($rows as $row){
 | 
						||
						
 | 
						||
						$this->fuckhtml->load($row);
 | 
						||
						
 | 
						||
						if(strpos($row["attributes"]["class"], "op-xueshu-links-more") !== false){
 | 
						||
							
 | 
						||
							continue;
 | 
						||
						}
 | 
						||
						
 | 
						||
						$as =
 | 
						||
							$this->fuckhtml
 | 
						||
							->getElementsByTagName(
 | 
						||
								"a"
 | 
						||
							);
 | 
						||
						
 | 
						||
						foreach($as as $a){
 | 
						||
							
 | 
						||
							$sublinks[] = [
 | 
						||
								"title" =>
 | 
						||
									$this->titledots(
 | 
						||
										$this->fuckhtml
 | 
						||
										->getTextContent(
 | 
						||
											$a
 | 
						||
										)
 | 
						||
									),
 | 
						||
								"description" => null,
 | 
						||
								"date" => null,
 | 
						||
								"url" =>
 | 
						||
									$this->fuckhtml
 | 
						||
									->getTextContent(
 | 
						||
										$a["attributes"]["href"]
 | 
						||
									)
 | 
						||
							];
 | 
						||
						}
 | 
						||
					}
 | 
						||
				}
 | 
						||
				
 | 
						||
				$out["web"][] = [
 | 
						||
					"title" => $title,
 | 
						||
					"description" => $description,
 | 
						||
					"url" => $url,
 | 
						||
					"date" => null,
 | 
						||
					"type" => "web",
 | 
						||
					"thumb" => $thumb,
 | 
						||
					"sublink" => $sublinks,
 | 
						||
					"table" => []
 | 
						||
				];
 | 
						||
				continue;
 | 
						||
			}
 | 
						||
		}
 | 
						||
		
 | 
						||
		//
 | 
						||
		// Remove tracking URLs and fetch additional image resources
 | 
						||
		//
 | 
						||
		$this->resolve_urls($proxy, $out, ["web", "video"]);
 | 
						||
		$this->resolve_images($proxy, $out);
 | 
						||
		
 | 
						||
		return $out;
 | 
						||
	}
 | 
						||
	
 | 
						||
	/**
	 * Fetch one page of Baidu image search results.
	 *
	 * When $get["npt"] is set, the stored request parameters and proxy are
	 * restored from the backend and "pn" is advanced by 60. Otherwise a new
	 * request is built from $get["s"] plus the sort/size/ratio/format/color/
	 * type filters.
	 *
	 * @param array $get Request parameters. Keys read here: npt, s, sort,
	 *                   size, ratio, format, color, type.
	 * @return array ["status" => "ok", "npt" => next-page token or null,
	 *               "image" => list of image entries]
	 * @throws Exception When the request fails, the body is not valid JSON,
	 *                   Baidu reports a message other than "success", or the
	 *                   response carries no image list.
	 */
	public function image($get){
		
		// https://image.baidu.com/search/acjson?word=asmr&rn=60&pn=0&newReq=1
		//$json = file_get_contents("scraper/baidu_img.json");
		
		if($get["npt"]){
			
			// resume a stored search and move on to the next block of 60
			[$params, $proxy] = $this->backend->get($get["npt"], "images");
			$params = json_decode($params, true);
			
			$params["pn"] = $params["pn"] + 60;
			
		}else{
			
			$proxy = $this->backend->get_ip();
			$params = [
				"word" => $get["s"],
				"rn" => 60, // results/page
				"pn" => 0, // item increment (0 * 60)
				"newReq" => 1 // without it the returned json is broken (per original note)
			];
			
			switch($get["sort"]){
				
				case "latest": $params["latest"] = 1; break;
				case "hot": $params["hot"] = 1; break;
			}
			
			if($get["size"] != "any"){
				
				$params["z"] = $get["size"];
			}
			
			if($get["ratio"] != "any"){
				
				$params["imgratio"] = $get["ratio"];
			}
			
			if($get["format"] != "any"){
				
				$params["imgformat"] = $get["format"];
			}
			
			if($get["color"] != "any"){
				
				$params["ic"] = $get["color"];
			}
			
			switch($get["type"]){
				
				case "hd": $params["hd"] = 1; break;
				case "isImgSet": $params["isImgSet"] = 1; break;
				case "copyright": $params["copyright"] = 1; break;
			}
		}
		
		// Build the referer from the query that is actually being sent, so
		// continuation requests (npt) do not depend on $get["s"].
		$referer =
			"https://image.baidu.com/search/index?tn=baiduimage&word=" .
			urlencode($params["word"] ?? $get["s"]);
		
		try{
			
			$response =
				$this->get(
					$proxy,
					"https://image.baidu.com/search/acjson",
					$params,
					$referer
				);
		}catch(Exception $error){
			
			throw new Exception("Failed to fetch JSON");
		}
		
		$json = json_decode($response, true);
		
		if($json === null){
			
			// detect captcha first: the raw response body must be loaded
			// here, the decoded value is null at this point
			$this->fuckhtml->load($response);
			$this->detect_ass();
			
			// fallback to json decode error
			throw new Exception("Failed to decode JSON");
		}
		
		if(
			isset($json["message"]) &&
			$json["message"] != "success"
		){
			
			throw new Exception("Baidu returned an error: {$json["message"]}");
		}
		
		if(!isset($json["data"]["images"])){
			
			throw new Exception("Baidu did not return an image object");
		}
		
		$out = [
			"status" => "ok",
			"npt" => null,
			"image" => []
		];
		
		foreach($json["data"]["images"] as $image){
			
			// thumbnail dimensions are carried in the thumbnail URL's
			// query string as "w" and "h"; fall back to 0 when absent
			$thumb_size = [];
			$thumb_query = parse_url($image["thumburl"], PHP_URL_QUERY);
			
			if(is_string($thumb_query)){
				
				parse_str($thumb_query, $thumb_size);
			}
			
			$out["image"][] = [
				"title" =>
					$this->fuckhtml
					->getTextContent(
						$image["titleShow"]
					),
				"source" => [
					[
						"url" => $image["objurl"],
						"width" => (int)$image["width"],
						"height" => (int)$image["height"]
					],
					[ // thumbnail
						"url" => $image["thumburl"],
						"width" => (int)($thumb_size["w"] ?? 0),
						"height" => (int)($thumb_size["h"] ?? 0)
					]
				],
				"url" => $image["fromUrl"]
			];
		}
		
		//
		// Detect if there's a next page
		//
		// a missing totalNum counts as 0, which means no next page
		if((int)($json["data"]["totalNum"] ?? 0) >= $params["pn"] + 60){
			
			$out["npt"] =
				$this->backend->store(
					json_encode($params),
					"images",
					$proxy
				);
		}
		
		return $out;
	}
 | 
						||
	
 | 
						||
	/**
	 * Fetch one page of Baidu video search results.
	 *
	 * When $get["npt"] is set, the stored request parameters and proxy are
	 * restored from the backend and "pn" is advanced by 10. Otherwise a new
	 * request is built from $get["s"]. The response is split on "<script>"
	 * and each chunk is parsed as one candidate result.
	 *
	 * @param array $get Request parameters. Keys read here: npt, s.
	 * @return array ["status" => "ok", "npt" => next-page token or null,
	 *               "video" => list of video entries, plus empty "author",
	 *               "livestream", "playlist" and "reel" lists]
	 * @throws Exception When the search page cannot be fetched.
	 */
	public function video($get){
		
		// https://www.baidu.com/sf/vsearch?pd=video&tn=vsearch&wd=jak%2Band%2Bdaxter&async=1&pn=0
		// NOTE(review): an earlier note here said to increase &pn by 20 for
		// pagination, but this method steps by 10 and expects 10 results
		// per page -- confirm against live responses
		
		//$html = file_get_contents("scraper/baidu_vid.html");
		
		if($get["npt"]){
			
			[$params, $proxy] = $this->backend->get($get["npt"], "videos");
			$params = json_decode($params, true);
			
			$params["pn"] = $params["pn"] + 10;
		}else{
			
			$proxy = $this->backend->get_ip();
			$params = [
				"pd" => "video",
				"tn" => "vsearch",
				"wd" => $get["s"],
				"async" => 1,
				"pn" => 0
			];
		}
		
		try{
			$html =
				$this->get(
					$proxy,
					"https://www.baidu.com/sf/vsearch",
					$params
				);
		}catch(Exception $error){
			
			throw new Exception("Failed to get search page");
		}
		
		// strip line breaks before splitting the page into chunks
		$html =
			str_replace(
				["\r", "\n"],
				"",
				$html
			);
		
		$out = [
			"status" => "ok",
			"npt" => null,
			"video" => [],
			"author" => [],
			"livestream" => [],
			"playlist" => [],
			"reel" => []
		];
		
		$html = explode("<script>", $html);
		
		foreach($html as $result){
			
			$result = trim($result);
			
			$this->fuckhtml->load($result);
			
			// get URL: taken from the first HTML comment in the chunk
			preg_match(
				'/<!-- *([^ ]*) *-->/',
				$result,
				$matches
			);
			
			if(!isset($matches[1])){
				
				// no link, give up
				continue;
			}
			
			$link = $matches[1];
			
			// get title
			$title =
				$this->fuckhtml
				->getElementsByClassName(
					"video-title",
					"a"
				);
			
			if(count($title) === 0){
				
				// should not happen
				continue;
			}
			
			$title =
				$this->fuckhtml
				->getTextContent(
					$title[0]
				);
			
			// get thumbnail
			$img =
				$this->fuckhtml
				->getElementsByClassName(
					"border-radius",
					"img"
				);
			
			if(count($img) !== 0){
				
				$thumb = [
					"url" =>
						$this->unfuckthumb(
							$this->fuckhtml
							->getTextContent(
								$img[0]["attributes"]["src"]
							)
						),
					"ratio" => "16:9"
				];
			}else{
				
				$thumb = [
					"url" => null,
					"ratio" => null
				];
			}
			
			// all spans of this chunk, reused by the class lookups below
			$span =
				$this->fuckhtml
				->getElementsByTagName(
					"span"
				);
			
			// get duration
			$duration =
				$this->fuckhtml
				->getElementsByClassName(
					"video_play_timer",
					$span
				);
			
			if(count($duration) !== 0){
				
				$duration =
					$this->hms2int(
						$this->fuckhtml
						->getTextContent(
							$duration[0]
						)
					);
			}else{
				
				$duration = null;
			}
			
			// get author
			// label looks like "来源:哔哩哔哩" (source, then the name)
			$author =
				$this->fuckhtml
				->getElementsByClassName(
					"wetSource",
					$span
				);
			
			if(count($author) !== 0){
				
				$author =
					explode(
						":",
						$this->fuckhtml
						->getTextContent(
							$author[0]
						),
						2
					);
				
				// a label without ":" has no name part; use null instead
				// of reading an undefined offset
				$author = $author[1] ?? null;
			}else{
				
				$author = null;
			}
			
			// get date posted, e.g. "发布时间:2024-05-06"
			// AND get description, e.g. "简介:Our first look"
			$infospans =
				array_merge(
					$this->fuckhtml
					->getElementsByClassName(
						"c-font-normal",
						$span
					),
					$this->fuckhtml
					->getElementsByClassName(
						"c-font-normal",
						"div"
					)
				);
			
			$date = null;
			$description = null;
			
			foreach($infospans as $infospan){
				
				// split "label:value" on the first ":" only
				$infospan =
					explode(
						":",
						$this->fuckhtml
						->getTextContent(
							$infospan
						),
						2
					);
				
				if(count($infospan) !== 2){
					
					// should not happen
					continue;
				}
				
				$infospan[1] =
					$this->fuckhtml
					->getTextContent(
						$infospan[1]
					);
				
				switch($infospan[0]){
					
					case "发布时间": // date posted
						$date = $this->parse_time($infospan[1]);
						break;
					
					case "简介": // description
						$description = $infospan[1];
						break;
				}
			}
			
			$out["video"][] = [
				"title" => $this->titledots($title),
				"description" => $this->titledots($description),
				"author" => [
					"name" => $author,
					"url" => null,
					"avatar" => null
				],
				"date" => $date,
				"duration" => $duration,
				"views" => null,
				"thumb" => $thumb,
				"url" => $link
			];
		}
		
		if(count($out["video"]) === 10){
			
			// assume there's another page after this
			$out["npt"] =
				$this->backend->store(
					json_encode($params),
					"videos",
					$proxy
				);
		}
		
		return $out;
	}
 | 
						||
	
 | 
						||
	/**
	 * Fetch one page of Baidu news results.
	 *
	 * When $get["npt"] is set, the stored request is loaded from the backend
	 * and its "pn" offset is advanced by 20 before fetching. Otherwise a fresh
	 * request is built from $get["s"] and "pn" is set to 0 after fetching.
	 *
	 * @param array $get Request parameters; reads "npt" and "s".
	 * @return array ["status" => "ok", "npt" => ..., "news" => [...]]
	 * @throws Exception When the search page cannot be fetched.
	 */
	public function news($get){
		
		$first_page = !$get["npt"];
		
		if($first_page){
			
			$proxy = $this->backend->get_ip();
			
			$npt_data = [
				"wd" => $get["s"],
				"rn" => 20,
				"tn" => "news"
			];
			
			// @TODO add filters
			
		}else{
			
			// resume from the request stored for the previous page
			[$stored, $proxy] = $this->backend->get($get["npt"], "news");
			$stored = json_decode($stored, true);
			
			$this->cookie = $stored["cookie"];
			$npt_data = $stored["req"];
			$npt_data["pn"] += 20;
		}
		
		try{
			
			$html =
				$this->get(
					$proxy,
					"https://www.baidu.com/s",
					$npt_data
				);
			
		}catch(Exception $error){
			
			throw new Exception("Failed to fetch search page");
		}
		
		if($first_page){
			
			// the first request is sent without an offset; the offset is
			// only added afterwards, before the parameters go to the parser
			$npt_data["pn"] = 0;
		}
		
		$data = $this->parse_search($proxy, "news", $npt_data, $html);
		
		$out = [
			"status" => "ok",
			"npt" => $data["npt"],
			"news" => []
		];
		
		// reshape each parsed web result into a news entry
		foreach($data["web"] as $result){
			
			$thumb_url = $result["thumb"]["url"];
			
			$out["news"][] = [
				"title" => $result["title"],
				"author" => null,
				"description" => $result["description"],
				"date" => $result["date"],
				"thumb" => [
					"url" => $thumb_url,
					"ratio" => $thumb_url !== null ? "16:9" : null
				],
				"url" => $result["url"]
			];
		}
		
		return $out;
	}
 | 
						||
	
 | 
						||
	/**
	 * Clean up a Baidu thumbnail URL.
	 *
	 * - gimg*.baidu.com URLs: returns the url-decoded value found after
	 *   "src=" (up to the next "&").
	 * - Other URLs: everything after the first "&" is treated as a query
	 *   string; "size" is dropped and "q", when present, is forced to "90".
	 *
	 * The URL is returned untouched when it does not have the expected shape.
	 *
	 * @param string $url Thumbnail URL.
	 * @return string Rewritten URL.
	 */
	private function unfuckthumb($url){
		
		// probe for proxy URL
		if(
			preg_match(
				'/^https?:\/\/gimg(?:[0-9]+)?\.baidu\.com/',
				$url
			)
		){
			
			$parts = explode("src=", $url);
			if(count($parts) !== 2){
				
				// zero or several "src=" markers: cannot pick one safely
				return $url;
			}
			
			return urldecode(explode("&", $parts[1])[0]);
		}
		
		$q = explode("&", $url, 2);
		
		if(count($q) !== 2){
			
			// no "&" at all, nothing to rewrite
			return $url;
		}
		
		// baidu does not follow spec and mixes "?" into the parameter list:
		// &fmt=auto?s=BB32F3A050471AEC72886934030090C4&sec=1753203600&t=0fb2194775d3bd3d1bb114b818479e0a
		parse_str(str_replace("?", "&", $q[1]), $query);
		
		unset($query["size"]);
		if(isset($query["q"])){ $query["q"] = "90"; }
		
		// Rebuild from the untouched prefix instead of str_replace()-ing the
		// old tail inside the full URL: str_replace() rewrites every
		// occurrence, so a short tail that also appears in the host/path
		// would corrupt the prefix.
		return $q[0] . "&" . http_build_query($query);
	}
 | 
						||
	
 | 
						||
	/**
	 * Strip leading/trailing whitespace, dots, NUL bytes and ellipsis
	 * characters ("…") from a title or description.
	 *
	 * trim() works on bytes, so listing the 3-byte "…" in its character mask
	 * strips the bytes E2, 80 and A6 individually and mangles any text that
	 * starts or ends with a multibyte character sharing one of them
	 * (e.g. "一" = E4 B8 80). A UTF-8 aware regex is used instead.
	 *
	 * @param string|null $title Text to clean.
	 * @return string Cleaned text.
	 */
	private function titledots($title){
		
		$title = (string)$title;
		
		$clean =
			preg_replace(
				'/\A[ .\t\n\r\x00\x0B…]+|[ .\t\n\r\x00\x0B…]+\z/u',
				"",
				$title
			);
		
		if($clean === null){
			
			// input is not valid UTF-8, so the /u pattern cannot run;
			// fall back to a trim limited to single-byte characters
			return trim($title, " .\t\n\r\0\x0B");
		}
		
		return $clean;
	}
 | 
						||
	
 | 
						||
	/**
	 * Convert a "h:m:s", "m:s" or "s" duration string to seconds.
	 *
	 * The string is split on ":" into at most three fields; each field is
	 * cast to int, so non-numeric fields count as 0.
	 *
	 * @param string $time Duration string.
	 * @return int Duration in seconds.
	 */
	private function hms2int($time){
		
		$seconds = 0;
		
		// fold the fields left to right in base 60:
		// ((h * 60) + m) * 60 + s
		foreach(explode(":", $time, 3) as $field){
			
			$seconds = ($seconds * 60) + (int)$field;
		}
		
		return $seconds;
	}
 | 
						||
	
 | 
						||
	/**
	 * Parse a Chinese view-count label into an integer.
	 *
	 * Recognised forms, checked in this order:
	 *   "1.2亿次" => 120000000 (亿 = 10^8)
	 *   "1.5万次" => 15000     (万 = 10^4)
	 *   "123次"   => 123
	 *
	 * The scaled forms accept a decimal part; without that, "1.5万次" only
	 * matched "5万次" and was reported as 50000.
	 *
	 * @param string $views Label text.
	 * @return int|null View count, or null when no count is found.
	 */
	private function parse_viewcount($views){
		
		// pattern => multiplier, larger units first
		$units = [
			'/([0-9]+(?:\.[0-9]+)?)亿次/' => 100000000,
			'/([0-9]+(?:\.[0-9]+)?)万次/' => 10000,
			'/([0-9]+)次/' => 1
		];
		
		foreach($units as $pattern => $multiplier){
			
			if(
				preg_match(
					$pattern,
					$views,
					$matches
				)
			){
				
				// round() absorbs float noise such as 1.2 * 10000
				return (int)round((float)$matches[1] * $multiplier);
			}
		}
		
		return null;
	}
 | 
						||
	
 | 
						||
	/**
	 * Parse a Baidu date label into a unix timestamp.
	 *
	 * Recognised forms, checked in this order:
	 *   2023年8月7日 => yyyy/m/d
	 *   昨天11:45    => yesterday at 11:45 (昨天 alone => yesterday)
	 *   3天前        => 3 days ago
	 *   1个月前      => 1 month ago
	 *   3小时前      => 3 hours ago
	 *   5分钟前      => 5 minutes ago
	 * Anything else is handed to strtotime() as-is.
	 *
	 * Every failure is reported as null; a recognised label that strtotime()
	 * rejects no longer leaks strtotime()'s false to the caller.
	 *
	 * @param string|null $time Date label.
	 * @return int|null Unix timestamp, or null when the label cannot be parsed.
	 */
	private function parse_time($time){
		
		$time = (string)$time;
		
		// pattern => sprintf template fed with the captured groups
		$formats = [
			'/([0-9]{4})年([0-9]{1,2})月([0-9]{1,2})日/' => '%s/%s/%s',
			'/昨天(.*)/' => 'Yesterday %s',
			'/([0-9]{1,4})天前/' => '%s days ago',
			'/([0-9]{1,4})个月前/' => '%s months ago',
			'/([0-9]{1,4})小时前/' => '%s hours ago',
			'/([0-9]{1,4})分钟前/' => '%s minutes ago'
		];
		
		foreach($formats as $pattern => $template){
			
			if(
				preg_match(
					$pattern,
					$time,
					$matches
				)
			){
				
				$parsed =
					strtotime(
						vsprintf(
							$template,
							array_slice($matches, 1)
						)
					);
				
				return $parsed === false ? null : $parsed;
			}
		}
		
		// attempt to parse as-is
		$parsed = strtotime($time);
		
		return $parsed === false ? null : $parsed;
	}
 | 
						||
	
 | 
						||
	/**
	 * Throw when the document currently loaded in $this->fuckhtml looks like
	 * a captcha page: either it has no <a> element at all, or the href of
	 * its first <a> points at wappass.baidu.com/static/captcha.
	 *
	 * @throws Exception When a captcha page is detected.
	 */
	private function detect_ass(){
		
		$anchors = $this->fuckhtml->getElementsByTagName("a");
		
		// a page without any link is treated as a captcha page
		$blocked = count($anchors) === 0;
		
		if(!$blocked){
			
			// only the first link is inspected
			$href =
				$this->fuckhtml
				->getTextContent(
					$anchors[0]["attributes"]["href"]
				);
			
			$blocked =
				(bool)preg_match(
					'/^https?:\/\/wappass\.baidu\.com\/static\/captcha/',
					$href
				);
		}
		
		if($blocked){
			
			throw new Exception("Baidu returned a Captcha");
		}
	}
 | 
						||
}
 |