2230 lines
42 KiB
PHP
2230 lines
42 KiB
PHP
<?php
|
||
|
||
class baidu{
|
||
|
||
/**
 * Set up the scraper: load the helper libraries, create the backend and
 * HTML parser objects, and reset the request state kept on the instance.
 */
public function __construct(){
	
	// include_once instead of include: a plain include re-runs the file
	// (and fatally redeclares whatever it defines) when the library has
	// already been loaded by other code in the same request
	include_once "lib/backend.php";
	$this->backend = new backend("baidu");
	
	include_once "lib/fuckhtml.php";
	$this->fuckhtml = new fuckhtml();
	
	// bookkeeping shared by redirect_add_url() and resolve_urls():
	// the curl_multi handle plus the [category][item][sublink] slot
	// the next queued transfer is stored under
	$this->handles = [];
	$this->proc = null;
	$this->handle_category = null;
	$this->handle_increment = 0;
	$this->sublink_increment = 0;
	
	// cookie string sent with requests, filled in by get()
	$this->cookie = null;
}
|
||
|
||
/**
 * Describe the search filters available for a result page type.
 *
 * Each filter maps to a "display" label and an "option" entry, which is
 * either the string "_DATE" or a list of value => label choices. The
 * trailing comments show the Baidu query parameter each choice maps to.
 *
 * @param string $page "web", "images", "videos" or "news"
 * @return array|null  Filter definitions, or null for any other page
 */
public function getfilters($page){
	
	if($page == "web"){
		
		$date_filters = [];
		
		$date_filters["newer"] = [
			"display" => "Newer than",
			"option" => "_DATE"
		];
		
		$date_filters["older"] = [
			"display" => "Older than",
			"option" => "_DATE"
		];
		
		return $date_filters;
	}
	
	if($page == "images"){
		
		$sort = [
			"relevance" => "Relevance", // no param
			"latest" => "Latest", // &latest=1
			"hot" => "Hot" // &hot=1
		];
		
		// &z=<value>
		$size = [
			"any" => "Any size",
			"7" => "Extra large (1080px+)",
			"6" => "Large (600px~1080px)",
			"5" => "Medium (300px~600px)",
			"4" => "Small (1px~300px)"
		];
		
		// &imgratio=<value>
		$ratio = [
			"any" => "Any ratio",
			"1" => "Tall vertical",
			"2" => "Vertical",
			"3" => "Square",
			"4" => "Horizontal",
			"5" => "Wide horizontal"
		];
		
		// &imgformat=<value>
		$format = [
			"any" => "Any format",
			"3" => "JPG",
			"5" => "JPEG",
			"4" => "PNG",
			"2" => "BMP",
			"6" => "GIF (Animated)"
		];
		
		// &ic=<value>
		$color = [
			"any" => "Any color",
			"1024" => "White",
			"2048" => "Black & White",
			"512" => "Black",
			"64" => "Magenta",
			"16" => "Blue",
			"1" => "Red",
			"2" => "Yellow",
			"32" => "Purple",
			"4" => "Green",
			"8" => "Teal",
			"256" => "Orange",
			"128" => "Brown"
		];
		
		$type = [
			"any" => "Any type",
			"hd" => "HD", // &hd=1
			"isImgSet" => "Photo album", // &isImgSet=1
			"copyright" => "Copyright" // &copyright=1
		];
		
		return [
			"sort" => ["display" => "Sort", "option" => $sort],
			"size" => ["display" => "Size", "option" => $size],
			"ratio" => ["display" => "Ratio", "option" => $ratio],
			"format" => ["display" => "Format", "option" => $format],
			"color" => ["display" => "Color", "option" => $color],
			"type" => ["display" => "Type", "option" => $type]
		];
	}
	
	if($page == "videos"){
		
		// no filters for video search
		return [];
	}
	
	if($page == "news"){
		
		$category = [
			"any" => "All news",
			"media" => "Media websites", // &medium=1
			"baijiahao" => "Baidu Baijiahao" // &medium=2
		];
		
		return [
			"category" => ["display" => "Category", "option" => $category]
		];
	}
	
	// unknown page type
	return null;
}
|
||
|
||
/**
 * Send a GET request and return the response body.
 *
 * Cookies received through Set-Cookie headers are merged into
 * $this->cookie and sent back on later requests.
 *
 * @param mixed        $proxy   Proxy descriptor, handed to backend->assign_proxy()
 * @param string       $url     Target URL, without query string
 * @param array        $get     Query parameters appended to $url
 * @param string|false $referer false sends page-navigation headers; a string
 *                              sends XHR-style headers with that Referer
 * @return string|bool Response body as returned by curl_exec()
 * @throws Exception   With the curl error message when the transfer fails
 */
private function get($proxy, $url, $get = [], $referer = false){
	
	$curlproc = curl_init();
	
	if($get !== []){
		
		$url .= "?" . http_build_query($get);
	}
	
	// collect Set-Cookie name => value pairs while headers stream in
	$cookies_tmp = [];
	curl_setopt($curlproc, CURLOPT_HEADERFUNCTION, function($curlproc, $header) use (&$cookies_tmp){
		
		$length = strlen($header);
		
		$header = explode(":", $header, 2);
		
		if(
			isset($header[1]) &&
			trim(strtolower($header[0])) == "set-cookie"
		){
			
			$cookie_tmp = explode("=", trim($header[1]), 2);
			
			// keep the value only, drop attributes (path, expires, ...)
			$cookies_tmp[trim($cookie_tmp[0])] =
				explode(";", $cookie_tmp[1] ?? "", 2)[0];
		}
		
		// curl requires the number of bytes consumed
		return $length;
	});
	
	curl_setopt($curlproc, CURLOPT_URL, $url);
	
	curl_setopt($curlproc, CURLOPT_ENCODING, ""); // default encoding
	
	//
	// Build the header list. The order is kept identical to the
	// original hand-written lists, with the Cookie header (when we
	// have one) placed right after "Connection".
	//
	if($referer === false){
		
		// page navigation
		$headers = [
			"User-Agent: " . config::USER_AGENT,
			"Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
			"Accept-Language: en-US,en;q=0.5",
			"Accept-Encoding: gzip, deflate, br, zstd",
			"DNT: 1",
			"Sec-GPC: 1",
			"Connection: keep-alive"
		];
		
		$headers_tail = [
			"Upgrade-Insecure-Requests: 1",
			"Sec-Fetch-Dest: document",
			"Sec-Fetch-Mode: navigate",
			"Sec-Fetch-Site: cross-site",
			"Priority: u=0, i"
		];
	}else{
		
		// XHR-style request
		$headers = [
			"User-Agent: " . config::USER_AGENT,
			"Accept: application/json, text/plain, */*",
			"Accept-Language: en-US,en;q=0.5",
			"Accept-Encoding: gzip, deflate, br, zstd",
			"Referer: {$referer}",
			"DNT: 1",
			"Sec-GPC: 1",
			"Connection: keep-alive"
		];
		
		$headers_tail = [
			"Upgrade-Insecure-Requests: 1",
			"Sec-Fetch-Dest: empty",
			"Sec-Fetch-Mode: cors",
			"Sec-Fetch-Site: same-origin"
		];
	}
	
	// only send a Cookie header when there is something to send
	if(
		$this->cookie !== null &&
		$this->cookie !== ""
	){
		
		$headers[] = "Cookie: {$this->cookie}";
	}
	
	curl_setopt($curlproc, CURLOPT_HTTPHEADER, array_merge($headers, $headers_tail));
	
	curl_setopt($curlproc, CURLOPT_RETURNTRANSFER, true);
	curl_setopt($curlproc, CURLOPT_SSL_VERIFYHOST, 2);
	curl_setopt($curlproc, CURLOPT_SSL_VERIFYPEER, true);
	curl_setopt($curlproc, CURLOPT_CONNECTTIMEOUT, 30);
	curl_setopt($curlproc, CURLOPT_TIMEOUT, 30);
	
	$this->backend->assign_proxy($curlproc, $proxy);
	
	$data = curl_exec($curlproc);
	
	if(curl_errno($curlproc)){
		
		// release the handle before bailing out
		$error = curl_error($curlproc);
		curl_close($curlproc);
		
		throw new Exception($error);
	}
	
	curl_close($curlproc);
	
	//
	// Store cookies: merge the new ones over what we already hold so a
	// cookie that is set again replaces its old value instead of being
	// appended a second time
	//
	$jar = [];
	
	if(
		$this->cookie !== null &&
		$this->cookie !== ""
	){
		
		foreach(explode(";", $this->cookie) as $pair){
			
			$pair = explode("=", trim($pair), 2);
			
			if($pair[0] === ""){
				
				continue;
			}
			
			$jar[$pair[0]] = $pair[1] ?? "";
		}
	}
	
	foreach($cookies_tmp as $cookie_name => $cookie_value){
		
		$jar[$cookie_name] = $cookie_value;
	}
	
	$pairs = [];
	foreach($jar as $cookie_name => $cookie_value){
		
		$pairs[] = $cookie_name . "=" . $cookie_value;
	}
	
	$this->cookie = implode("; ", $pairs);
	
	return $data;
}
|
||
|
||
/**
 * Queue a header-only request for a Baidu tracking link on the shared
 * curl_multi handle ($this->proc).
 *
 * URLs that are not baidu.com/link? redirects are ignored. The created
 * handle is stored in $this->handles under the current
 * [category][item][sublink] counters so resolve_urls() can read it back.
 *
 * @param mixed  $proxy Proxy descriptor, handed to backend->assign_proxy()
 * @param string $url   URL to check and, if needed, resolve
 */
private function redirect_add_url($proxy, $url){
	
	$is_tracking_link =
		preg_match(
			'/^https?:\/\/(?:www\.)?baidu\.com\/link\?/',
			$url
		);
	
	if($is_tracking_link === 0){
		
		// not a baidu redirect
		return;
	}
	
	$request_headers = [
		"User-Agent: " . config::USER_AGENT,
		"Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
		"Accept-Language: en-US,en;q=0.5",
		"Accept-Encoding: gzip, deflate, br, zstd",
		"DNT: 1",
		"Sec-GPC: 1",
		"Connection: keep-alive",
		"Upgrade-Insecure-Requests: 1",
		"Sec-Fetch-Dest: document",
		"Sec-Fetch-Mode: navigate",
		"Sec-Fetch-Site: none",
		"Sec-Fetch-User: ?1",
		"Priority: u=0, i"
	];
	
	$handle = curl_init();
	
	curl_setopt_array(
		$handle,
		[
			CURLOPT_URL => $url,
			CURLOPT_ENCODING => "", // default encoding
			CURLOPT_HTTPHEADER => $request_headers,
			CURLOPT_RETURNTRANSFER => true,
			CURLOPT_SSL_VERIFYHOST => 2,
			CURLOPT_SSL_VERIFYPEER => true,
			CURLOPT_CONNECTTIMEOUT => 30,
			CURLOPT_TIMEOUT => 30,
			// we only want the response headers (Location), not the body
			CURLOPT_HEADER => true,
			CURLOPT_NOBODY => true
		]
	);
	
	$this->backend->assign_proxy($handle, $proxy);
	
	curl_multi_add_handle($this->proc, $handle);
	
	// remember which result slot this transfer belongs to
	$category = $this->handle_category;
	$item = $this->handle_increment;
	$slot = $this->sublink_increment;
	
	$this->handles[$category][$item][$slot] = $handle;
}
|
||
|
||
/**
 * Replace Baidu tracking links in $collection with their real targets.
 *
 * Every item URL (and sublink URL) in the given categories is passed to
 * redirect_add_url(), all queued requests run in parallel, and each
 * "Location" response header overwrites the matching URL in place.
 *
 * @param mixed $proxy      Proxy descriptor, forwarded to redirect_add_url()
 * @param array $collection Result set, modified in place
 * @param array $categories Keys of $collection to process (e.g. "web")
 */
private function resolve_urls($proxy, &$collection, $categories){
	
	$this->proc = curl_multi_init();
	
	// start from a clean slate: handles from an earlier run are closed
	$this->handles = [];
	
	foreach($categories as $category){
		
		$this->handle_category = $category;
		$this->handle_increment = 0;
		
		foreach($collection[$category] as $item){
			
			// slot 0 is the item's own URL, slots 1..n are its sublinks
			$this->sublink_increment = 0;
			$this->redirect_add_url($proxy, $item["url"]);
			
			if(isset($item["sublink"])){
				
				foreach($item["sublink"] as $sublink){
					
					$this->sublink_increment++;
					$this->redirect_add_url($proxy, $sublink["url"]);
				}
			}
			
			$this->handle_increment++;
		}
	}
	
	do{
		
		$status = curl_multi_exec($this->proc, $active);
		
		if(
			$active &&
			$status === CURLM_OK
		){
			
			// wait for socket activity instead of spinning at 100% CPU
			if(curl_multi_select($this->proc) === -1){
				
				// select failed, back off briefly before polling again
				usleep(1000);
			}
		}
		
	}while($active && $status === CURLM_OK);
	
	//
	// if we reach this, we're done downloading garbage
	//
	foreach($this->handles as $category => $items){
		
		foreach($items as $index => $slots){
			
			foreach($slots as $sublinkindex => $handle){
				
				preg_match(
					'/location: ?(.*)$/im',
					curl_multi_getcontent($handle),
					$location
				);
				
				if(isset($location[1])){
					
					if($sublinkindex === 0){
						
						$collection[$category][$index]["url"] = trim($location[1]);
					}else{
						
						// sublink slots are offset by one (slot 0 = item URL)
						$collection[$category][$index]["sublink"][$sublinkindex - 1]["url"] = trim($location[1]);
					}
				}
				
				curl_multi_remove_handle($this->proc, $handle);
				curl_close($handle);
			}
		}
	}
	
	curl_multi_close($this->proc);
	
	// don't keep references to closed handles around
	$this->handles = [];
}
|
||
|
||
/**
 * Swap the scraped image carousel for the full image set taken from
 * Baidu's image detail viewer.
 *
 * The viewer page linked by the SECOND image is downloaded (for some
 * reason that one doesn't trigger the captcha) and its embedded
 * "image-detail-data" JSON replaces $data["image"]. On any failure the
 * already scraped images are left untouched.
 *
 * @param mixed $proxy Proxy descriptor, forwarded to get()
 * @param array $data  Result set, "image" key modified in place
 */
private function resolve_images($proxy, &$data){
	
	if(
		!isset($data["image"][1]["url"]) ||
		preg_match(
			'/^https:\/\/image\.baidu\.com\/search\/detail/',
			$data["image"][1]["url"]
		) === 0
	){
		
		// no viewer link to follow (no carousel, or links already direct)
		return;
	}
	
	try{
		
		$html =
			$this->get(
				$proxy,
				$data["image"][1]["url"],
				[]
			);
		
	}catch(Exception $error){
		
		// fallback to the limited dataset we have
		return;
	}
	
	$this->fuckhtml->load($html);
	
	$script =
		$this->fuckhtml
		->getElementById(
			"image-detail-data",
			"script"
		);
	
	if(!$script){
		
		return;
	}
	
	$json =
		json_decode(
			$script["innerHTML"],
			true
		);
	
	if(
		!isset($json["data"]["images"]) ||
		!is_array($json["data"]["images"]) ||
		count($json["data"]["images"]) === 0
	){
		
		// do nothing
		return;
	}
	
	//
	// Discard all previously scraped images and use data
	// from the newly downloaded image carousel
	// the imageset !!should!! be the same
	//
	$images = [];
	
	foreach($json["data"]["images"] as $image){
		
		// thumbnail dimensions live in the thumbnail URL's query (?w=..&h=..)
		// cast + defaults: the URL may carry no query or no size at all
		parse_str((string)parse_url($image["thumburl"], PHP_URL_QUERY), $thumb_size);
		
		$images[] = [
			"title" =>
				$this->fuckhtml
				->getTextContent(
					$image["titleShow"]
				),
			"source" => [
				[
					"url" => $image["objurl"],
					"width" => (int)$image["width"],
					"height" => (int)$image["height"]
				],
				[ // thumbnail
					"url" => $image["thumburl"],
					"width" => (int)($thumb_size["w"] ?? 0),
					"height" => (int)($thumb_size["h"] ?? 0)
				]
			],
			"url" => $image["fromUrl"]
		];
	}
	
	$data["image"] = $images;
}
|
||
|
||
/**
 * Run a Baidu web search, or fetch the next page of a previous one.
 *
 * @param array $get Request parameters: "s" (query), "npt" (next page
 *                   token, falsy on the first page), "newer"/"older"
 *                   (date bounds, false when unset)
 * @return array     Parsed results, see parse_search()
 * @throws Exception When the token is unusable or the page can't be fetched
 */
public function web($get){
	
	if($get["npt"]){
		
		[$json, $proxy] = $this->backend->get($get["npt"], "web");
		
		$json = json_decode($json, true);
		
		if(!isset($json["req"]["pn"])){
			
			// corrupted token, don't fire a request with garbage parameters
			throw new Exception("Failed to decode next page token");
		}
		
		$this->cookie = $json["cookie"] ?? null;
		$npt_data = $json["req"];
		
		// 20 results per page
		$npt_data["pn"] = $npt_data["pn"] + 20;
		
		$request = $npt_data;
		
	}else{
		
		$proxy = $this->backend->get_ip();
		
		// @TODO fetching https://www.baidu.com first would fill $this->cookie,
		// probably not needed? we get blocked anyways
		
		$request = [
			"wd" => $get["s"],
			"rn" => 20
		];
		
		// &gpc=stf%3D<start>%2C<end>|stftype%3D2
		if(
			$get["older"] !== false ||
			$get["newer"] !== false
		){
			
			// stf is a <start>,<end> range: "newer than" is the lower
			// bound and "older than" the upper one. A missing bound
			// falls back to the epoch / the current time so the range
			// is never sent half-empty.
			$start = $get["newer"] === false ? 0 : $get["newer"];
			$end = $get["older"] === false ? time() : $get["older"];
			
			$request["gpc"] = "stf={$start},{$end}|stftype=2";
		}
		
		// the first page is requested without &pn, but the token must
		// carry pn=0 so the next page asks for pn=20
		$npt_data = $request;
		$npt_data["pn"] = 0;
	}
	
	try{
		
		$html = $this->get(
			$proxy,
			"https://www.baidu.com/s",
			$request
		);
		
	}catch(Exception $error){
		
		// keep the curl error reachable through getPrevious()
		throw new Exception("Failed to fetch search page", 0, $error);
	}
	
	return $this->parse_search($proxy, "web", $npt_data, $html);
}
|
||
|
||
/**
 * Parse a Baidu result page into the scraper's result structure.
 *
 * Walks every <div> with a numeric id (one per result) and sorts it into
 * web results, videos or images. Tracking links are then resolved with
 * resolve_urls() and the image carousel is expanded with resolve_images().
 *
 * NOTE: fuckhtml keeps ONE loaded document at a time. Lookups that take
 * a tag name search the document loaded last, so the order of load()
 * calls in here and in the parse_search_* helpers matters.
 *
 * @param mixed  $proxy    Proxy descriptor used for follow-up requests
 * @param string $pagetype Page name the next page token is stored under
 * @param array  $npt_data Request parameters saved in the next page token
 * @param string $html     Raw HTML of the result page
 * @return array           Keys: status, spelling, npt, answer, web, image,
 *                         video, news, related
 */
private function parse_search($proxy, $pagetype, $npt_data, $html){
	
	// @HACK
	// remove newlines from the html, they confuse fuckhtml
	$html = str_replace(["\n", "\r"], "", $html);
	
	$out = [
		"status" => "ok",
		"spelling" => [
			"type" => "no_correction",
			"using" => null,
			"correction" => null
		],
		"npt" => null,
		"answer" => [],
		"web" => [],
		"image" => [],
		"video" => [],
		"news" => [],
		"related" => []
	];
	
	$this->fuckhtml->load($html);
	
	// defined elsewhere in this class; presumably bails out on captcha pages
	$this->detect_ass();
	
	$datafields =
		$this->fuckhtml
		->getElementsByAttributeName(
			"id",
			"div"
		);
	
	//
	// Get next page
	//
	$npt =
		$this->fuckhtml
		->getElementsByClassName(
			"n",
			"a"
		);
	
	if(count($npt) !== 0){
		
		$out["npt"] =
			$this->backend->store(
				json_encode([
					"req" => $npt_data,
					"cookie" => $this->cookie
				]),
				$pagetype,
				$proxy
			);
	}
	
	//
	// Get related searches
	//
	$out["related"] = $this->parse_search_related($datafields);
	
	foreach($datafields as $datafield){
		
		if(
			!isset($datafield["attributes"]["id"]) ||
			preg_match(
				'/^[0-9]+$/',
				$datafield["attributes"]["id"]
			) === 0
		){
			
			// not a search result
			continue;
		}
		
		$this->fuckhtml->load($datafield);
		$div =
			$this->fuckhtml
			->getElementsByTagName(
				"div"
			);
		
		//
		// Don't parse as a search result if it's a card
		//
		$card =
			$this->fuckhtml
			->getElementsByClassName(
				"cosc-card",
				$div
			);
		
		if(count($card) !== 0){
			
			$this->parse_search_card($datafield, $div, $out);
			continue;
		}
		
		if(!isset($datafield["attributes"]["mu"])){
			
			// dont scrape if we dont have the direct link
			continue;
		}
		
		// class:FYB_RD -> News garbage, IGNORE
		
		$result =
			$this->fuckhtml
			->getElementsByClassName(
				"result",
				[$datafield]
			);
		
		if(count($result) !== 0){
			
			$web = $this->parse_search_result($datafield, $div);
			
			if($web !== null){
				
				$out["web"][] = $web;
			}
			continue;
		}
		
		$result =
			$this->fuckhtml
			->getElementsByClassName(
				"result-op",
				[$datafield]
			);
		
		if(count($result) !== 0){
			
			$this->parse_search_special($datafield, $div, $out);
		}
	}
	
	//
	// Remove tracking URLs and fetch additonal image resources
	//
	$this->resolve_urls($proxy, $out, ["web", "video"]);
	$this->resolve_images($proxy, $out);
	
	return $out;
}

/**
 * Collect the related searches listed in the "rs_new" container.
 * Leaves that container loaded in fuckhtml when it exists.
 *
 * @param array $datafields All <div> elements carrying an id
 * @return array            List of related search strings
 */
private function parse_search_related($datafields){
	
	$related = [];
	
	$container =
		$this->fuckhtml
		->getElementById(
			"rs_new",
			$datafields
		);
	
	if(!$container){
		
		return $related;
	}
	
	$this->fuckhtml->load($container);
	
	$links =
		$this->fuckhtml
		->getElementsByClassName(
			"c-color-link",
			"a"
		);
	
	foreach($links as $link){
		
		// keep what comes after the first ">" when there is one
		$text =
			explode(
				">",
				$this->fuckhtml
				->getTextContent(
					$link
				),
				2
			);
		
		$related[] = $text[count($text) - 1];
	}
	
	return $related;
}

/**
 * Parse a "cosc-card" block: short video lists go to $out["video"],
 * image carousels to $out["image"].
 *
 * @param array $datafield The result's root <div>
 * @param array $div       Every <div> found inside $datafield
 * @param array $out       Result set, appended to in place
 */
private function parse_search_card($datafield, $div, &$out){
	
	//
	// Parse chinese youtube shorts
	//
	$ytshorts_probe =
		$this->fuckhtml
		->getElementsByClassName(
			"tts-b-item",
			$div
		);
	
	if(count($ytshorts_probe) !== 0){
		
		$videos =
			$this->fuckhtml
			->getElementsByAttributeValue(
				"data-show",
				"list",
				$div
			);
		
		foreach($videos as $video){
			
			$this->fuckhtml->load($video);
			
			$title =
				$this->fuckhtml
				->getElementsByClassName(
					"cosc-title-slot",
					"span"
				);
			
			if(count($title) === 0){
				
				continue;
			}
			
			$url =
				$this->fuckhtml
				->getElementsByTagName(
					"a"
				);
			
			if(count($url) === 0){
				
				continue;
			}
			
			$image =
				$this->fuckhtml
				->getElementsByClassName(
					"cos-image-body",
					"img"
				);
			
			if(count($image) === 0){
				
				$thumb = [
					"ratio" => null,
					"url" => null
				];
			}else{
				
				$thumb = [
					"ratio" => "1:1",
					"url" =>
						$this->fuckhtml
						->getTextContent(
							$image[0]["attributes"]["src"]
						)
				];
			}
			
			// get duration
			$divs =
				$this->fuckhtml
				->getElementsByAttributeName(
					"class",
					"div"
				);
			
			$duration = null;
			foreach($divs as $probe){
				
				if(strpos($probe["attributes"]["class"], "tag-bottom-right") !== false){
					
					$duration =
						$this->hms2int(
							$this->fuckhtml
							->getTextContent(
								$probe
							)
						);
					break;
				}
			}
			
			$out["video"][] = [
				"title" =>
					$this->fuckhtml
					->getTextContent(
						$title[0]
					),
				"description" => null,
				"date" => null,
				"duration" => $duration,
				"views" => null,
				"thumb" => $thumb,
				"url" =>
					$this->fuckhtml
					->getTextContent(
						$url[0]["attributes"]["href"]
					)
			];
		}
	}
	
	//
	// Parse image carousel
	//
	$is_image_carousel = false;
	foreach($div as $d){
		
		if(
			isset($d["attributes"]["class"]) &&
			strpos($d["attributes"]["class"], "image-container") !== false
		){
			
			$is_image_carousel = true;
			break;
		}
	}
	
	if(!$is_image_carousel){
		
		return;
	}
	
	preg_match(
		'/<!--s-data:([\S\s]*)-->/U',
		$datafield["innerHTML"],
		$matches
	);
	
	if(!isset($matches[1])){
		
		return;
	}
	
	// weird behavior with the smaller image carousel where --cos* CSS variables are escaped wrong
	$json =
		$this->fuckhtml
		->parseJsObject(
			str_replace(
				"-\-",
				"--",
				$matches[1]
			)
		);
	
	if(
		$json === null ||
		!isset($json["imageList"][0]["images"])
	){
		
		return;
	}
	
	foreach($json["imageList"][0]["images"] as $image){
		
		// thumbnail size comes from the thumbnail URL's query (?w=..&h=..)
		parse_str((string)parse_url($image["thumburl"], PHP_URL_QUERY), $thumb_size);
		
		$out["image"][] = [
			"title" => "image",
			"source" => [
				[
					"url" => $image["objurl"],
					"width" => (int)$image["width"],
					"height" => (int)$image["height"]
				],
				[ // thumbnail
					"url" => $image["thumburl"],
					"width" => (int)($thumb_size["w"] ?? 0),
					"height" => (int)($thumb_size["h"] ?? 0)
				]
			],
			"url" => $image["jumpUrl"]
		];
	}
}

/**
 * Parse a normal search result (class "result").
 * Expects $datafield to be the document currently loaded in fuckhtml.
 *
 * @param array $datafield The result's root <div>
 * @param array $div       Every <div> found inside $datafield
 * @return array|null      Web result entry, or null when it has no title
 */
private function parse_search_result($datafield, $div){
	
	$title =
		$this->fuckhtml
		->getElementsByClassName(
			"sc-link",
			"a"
		);
	
	if(count($title) === 0){
		
		// should not happen
		return null;
	}
	
	$title =
		$this->titledots(
			$this->fuckhtml
			->getTextContent(
				$title[0]
			)
		);
	
	// parse description
	$description = null;
	
	$description_probe =
		$this->fuckhtml
		->getElementsByClassName(
			"c-color",
			$div
		);
	
	if(count($description_probe) !== 0){
		
		$this->fuckhtml->load($description_probe[0]);
		
		$spans =
			$this->fuckhtml
			->getElementsByAttributeName(
				"class",
				"span"
			);
		
		foreach($spans as $span){
			
			if(stripos($span["attributes"]["class"], "summary-text") !== false){
				
				$description =
					$this->titledots(
						$this->fuckhtml
						->getTextContent(
							$span
						)
					);
				break;
			}
		}
		
		// go back to the whole result for the lookups below
		$this->fuckhtml->load($datafield);
	}
	
	// parse date
	$date_probe =
		$this->fuckhtml
		->getElementsByClassName(
			"cos-color-text-minor",
			"span"
		);
	
	if(count($date_probe) !== 0){
		
		$date =
			$this->parse_time(
				$this->fuckhtml
				->getTextContent(
					$date_probe[0]
				)
			);
	}else{
		
		$date = null;
	}
	
	// parse image
	$img =
		$this->fuckhtml
		->getElementsByTagName(
			"img"
		);
	
	if(count($img) !== 0){
		
		$thumb = [
			"ratio" => "16:9",
			"url" =>
				$this->unfuckthumb(
					$this->fuckhtml
					->getTextContent(
						$img[0]["attributes"]["src"]
					)
				)
		];
	}else{
		
		$thumb = [
			"ratio" => null,
			"url" => null
		];
	}
	
	// get page type, the last <b> tag wins
	$type_probe =
		$this->fuckhtml
		->getElementsByTagName(
			"b"
		);
	
	$type = "web";
	foreach($type_probe as $probe){
		
		$type =
			strtolower(
				trim(
					$this->fuckhtml
					->getTextContent(
						$probe
					),
					" 【】"
				)
			);
	}
	
	// get extra links
	$sublinks = [];
	
	foreach($div as $d){
		
		if(
			!isset($d["attributes"]["class"]) ||
			strpos($d["attributes"]["class"], "exta-link") === false
		){
			
			continue;
		}
		
		$this->fuckhtml->load($d);
		
		$links =
			$this->fuckhtml
			->getElementsByClassName(
				"cos-space-mt-xs",
				"div"
			);
		
		foreach($links as $link){
			
			$this->fuckhtml->load($link);
			$s_title =
				$this->fuckhtml
				->getElementsByTagName(
					"h3"
				);
			
			if(
				count($s_title) === 0 ||
				!isset($s_title[0]["attributes"]["data-click"])
			){
				
				// should not happen
				continue;
			}
			
			// data-click is JSON whose clk_info field is JSON again
			$click =
				json_decode(
					$this->fuckhtml
					->getTextContent(
						$s_title[0]["attributes"]["data-click"]
					),
					true
				);
			
			if(!isset($click["clk_info"])){
				
				continue;
			}
			
			$click =
				json_decode(
					$click["clk_info"],
					true
				);
			
			if(!isset($click["url"])){
				
				// no link, skip
				continue;
			}
			
			$p =
				$this->fuckhtml
				->getElementsByTagName(
					"p"
				);
			
			$s_description = null;
			
			if(
				count($p) !== 0 &&
				isset($p[0]["attributes"]["sub-show-log"])
			){
				
				$log =
					json_decode(
						$this->fuckhtml
						->getTextContent(
							$p[0]["attributes"]["sub-show-log"]
						),
						true
					);
				
				if(isset($log["ext"]["content"])){
					
					$s_description = $log["ext"]["content"];
				}
			}
			
			$sublinks[] = [
				"title" =>
					$this->fuckhtml
					->getTextContent(
						$s_title[0]
					),
				"description" => $s_description,
				"url" => rawurldecode($click["url"]),
				"date" => null
			];
		}
		
		// only the first extra-link container is used
		break;
	}
	
	return [
		"title" => $title,
		"description" => $description,
		"url" =>
			$this->fuckhtml
			->getTextContent(
				$datafield["attributes"]["mu"]
			),
		"date" => $date,
		"type" => $type,
		"thumb" => $thumb,
		"sublink" => $sublinks,
		"table" => []
	];
}

/**
 * Parse a special result (class "result-op"): video carousels go to
 * $out["video"], anything else (wiki entries, rich divs) to $out["web"].
 * Expects $datafield to be the document currently loaded in fuckhtml.
 *
 * @param array $datafield The result's root <div>
 * @param array $div       Every <div> found inside $datafield
 * @param array $out       Result set, appended to in place
 */
private function parse_search_special($datafield, $div, &$out){
	
	//
	// Parse video carousel
	//
	if(
		isset($datafield["attributes"]["tpl"]) &&
		stripos($datafield["attributes"]["tpl"], "video") !== false
	){
		
		preg_match(
			'/<!--s-data:([\S\s]*)-->/U',
			$datafield["innerHTML"],
			$matches
		);
		
		if(!isset($matches[1])){
			
			return;
		}
		
		$json =
			json_decode(
				$matches[1],
				true
			);
		
		if(
			!isset($json["videoList"]) ||
			!is_array($json["videoList"])
		){
			
			// not the payload we expected
			return;
		}
		
		foreach($json["videoList"] as $video){
			
			$out["video"][] = [
				"title" => $video["title"],
				"description" =>
					$this->titledots(
						$video["desc"]
					),
				"date" =>
					$this->parse_time(
						$video["pubTime"]
					),
				"duration" =>
					$this->hms2int(
						$video["duration"]
					),
				"views" =>
					$this->parse_viewcount(
						$video["playCount"]
					),
				"thumb" => [
					"ratio" => "16:9",
					"url" => $video["poster"]
				],
				"url" => $video["bindProps"]["link"]
			];
		}
		
		return;
	}
	
	//
	// Special result div (wiki entries, rich divs)
	//
	$title =
		$this->fuckhtml
		->getElementsByTagName(
			"h3"
		);
	
	if(count($title) === 0){
		
		// should have a title somewhere
		return;
	}
	
	// keep what comes after the first ">" when there is one
	$title =
		explode(
			">",
			$this->fuckhtml
			->getTextContent(
				$title[0]
			),
			2
		);
	
	$title = $title[count($title) - 1];
	
	$description = $this->parse_search_special_description();
	
	// get thumbnail
	$thumb =
		$this->fuckhtml
		->getElementsByTagName(
			"img"
		);
	
	if(count($thumb) !== 0){
		
		$thumb = [
			"ratio" => "1:1",
			"url" =>
				$this->unfuckthumb(
					$this->fuckhtml
					->getTextContent(
						$thumb[0]["attributes"]["src"]
					)
				)
		];
	}else{
		
		$thumb = [
			"ratio" => null,
			"url" => null
		];
	}
	
	// get sublinks
	preg_match(
		'/<!--s-data:([\S\s]*)-->/U',
		$datafield["innerHTML"],
		$matches
	);
	
	$sublinks = [];
	
	if(isset($matches[1])){
		
		$json =
			json_decode(
				$matches[1],
				true
			);
		
		if(isset($json["buttons"])){
			
			foreach($json["buttons"] as $button){
				
				$sublinks[] = [
					"title" => $button["text"],
					"description" => null,
					"date" => null,
					"url" => $button["url"]
				];
			}
		}elseif(isset($json["mthreadList"])){
			
			foreach($json["mthreadList"] as $thread){
				
				$sublinks[] = [
					"title" =>
						$this->fuckhtml
						->getTextContent(
							$thread["title"]
						),
					"description" => null,
					"date" => null,
					"url" => $thread["ttsInfo"]["titleUrl"]
				];
			}
		}
	}
	
	// get URL
	$url =
		$this->fuckhtml
		->getTextContent(
			$datafield["attributes"]["mu"]
		);
	
	if(
		preg_match(
			'/^https?:\/\/(?:fakeurl|nourl)(?:\.ubs)?\.baidu\.com/',
			$url
		)
	){
		
		// placeholder URL (http://fakeurl.baidu.com and friends),
		// use the first link of the result instead
		$as =
			$this->fuckhtml
			->getElementsByTagName(
				"a"
			);
		
		if(count($as) !== 0){
			
			$url =
				$this->fuckhtml
				->getTextContent(
					$as[0]["attributes"]["href"]
				);
		}
	}
	
	// get xueshu sublinks
	$xueshu_list =
		$this->fuckhtml
		->getElementsByClassName(
			"op-xueshu-links-d20-list",
			$div
		);
	
	if(count($xueshu_list) !== 0){
		
		$this->fuckhtml->load($xueshu_list[0]);
		
		$rows =
			$this->fuckhtml
			->getElementsByClassName(
				"c-row",
				"div"
			);
		
		foreach($rows as $row){
			
			if(strpos($row["attributes"]["class"], "op-xueshu-links-more") !== false){
				
				// "read more" row, not a link list
				continue;
			}
			
			$this->fuckhtml->load($row);
			
			$as =
				$this->fuckhtml
				->getElementsByTagName(
					"a"
				);
			
			foreach($as as $a){
				
				$sublinks[] = [
					"title" =>
						$this->titledots(
							$this->fuckhtml
							->getTextContent(
								$a
							)
						),
					"description" => null,
					"date" => null,
					"url" =>
						$this->fuckhtml
						->getTextContent(
							$a["attributes"]["href"]
						)
				];
			}
		}
	}
	
	$out["web"][] = [
		"title" => $title,
		"description" => $description,
		"url" => $url,
		"date" => null,
		"type" => "web",
		"thumb" => $thumb,
		"sublink" => $sublinks,
		"table" => []
	];
}

/**
 * Find the description of a special result in the document currently
 * loaded in fuckhtml, trying the known layouts in order.
 *
 * @return string|null Description text, or null when none is found
 */
private function parse_search_special_description(){
	
	// probe for wiki-like entry
	$paragraphs =
		$this->fuckhtml
		->getElementsByClassName(
			"sc-paragraph",
			"p"
		);
	
	if(count($paragraphs) !== 0){
		
		preg_match(
			'/<!--s-text-->([\S\s]*)<!--\/s-text-->/U',
			$paragraphs[count($paragraphs) - 1]["innerHTML"],
			$matches
		);
		
		if(!isset($matches[1])){
			
			return null;
		}
		
		return
			$this->titledots(
				$this->fuckhtml
				->getTextContent(
					$matches[1]
				)
			);
	}
	
	// try and get grey description
	$grey =
		$this->fuckhtml
		->getElementsByClassName(
			"c-color-gray2",
			"p"
		);
	
	if(count($grey) !== 0){
		
		return
			$this->fuckhtml
			->getTextContent(
				$grey[0]
			);
	}
	
	// probe for special social media description
	$social =
		$this->fuckhtml
		->getElementsByClassName(
			"c-color-text",
			"div"
		);
	
	if(isset($social[0]["attributes"]["aria-label"])){
		
		return
			$this->fuckhtml
			->getTextContent(
				$social[0]["attributes"]["aria-label"]
			);
	}
	
	// check for news tab description
	$span =
		$this->fuckhtml
		->getElementsByClassName(
			"c-font-normal",
			"span"
		);
	
	foreach($span as $s){
		
		if(isset($s["attributes"]["aria-label"])){
			
			// the text is read from the LAST span of the set
			return
				$this->titledots(
					$this->fuckhtml
					->getTextContent(
						$span[count($span) - 1]
					)
				);
		}
	}
	
	return null;
}
|
||
|
||
/**
 * Run a Baidu image search, or fetch the next page of a previous one.
 *
 * @param array $get Request parameters: "s" (query), "npt" (next page
 *                   token, falsy on the first page) and the filters
 *                   sort, size, ratio, format, color, type
 * @return array     Keys: status, npt (token or null), image (list)
 * @throws Exception When the request fails, the response isn't usable
 *                   JSON, or Baidu reports an error
 */
public function image($get){
	
	// https://image.baidu.com/search/acjson?word=asmr&rn=60&pn=0&newReq=1
	
	if($get["npt"]){
		
		[$params, $proxy] = $this->backend->get($get["npt"], "images");
		$params = json_decode($params, true);
		
		if(!isset($params["pn"], $params["word"])){
			
			// corrupted token, don't fire a request with garbage parameters
			throw new Exception("Failed to decode next page token");
		}
		
		$params["pn"] = $params["pn"] + 60;
		
	}else{
		
		$proxy = $this->backend->get_ip();
		$params = [
			"word" => $get["s"],
			"rn" => 60, // results/page
			"pn" => 0, // item increment (0 * 60)
			"newReq" => 1 // otherwise the json comes back broken
		];
		
		switch($get["sort"]){
			
			case "latest": $params["latest"] = 1; break;
			case "hot": $params["hot"] = 1; break;
		}
		
		if($get["size"] != "any"){
			
			$params["z"] = $get["size"];
		}
		
		if($get["ratio"] != "any"){
			
			$params["imgratio"] = $get["ratio"];
		}
		
		if($get["format"] != "any"){
			
			$params["imgformat"] = $get["format"];
		}
		
		if($get["color"] != "any"){
			
			$params["ic"] = $get["color"];
		}
		
		switch($get["type"]){
			
			case "hd": $params["hd"] = 1; break;
			case "isImgSet": $params["isImgSet"] = 1; break;
			case "copyright": $params["copyright"] = 1; break;
		}
	}
	
	try{
		
		// the referer is built from the stored search term, which is
		// available on next-page requests too
		$response =
			$this->get(
				$proxy,
				"https://image.baidu.com/search/acjson",
				$params,
				"https://image.baidu.com/search/index?tn=baiduimage&word=" . urlencode($params["word"])
			);
		
	}catch(Exception $error){
		
		throw new Exception("Failed to fetch JSON", 0, $error);
	}
	
	$json = json_decode($response, true);
	
	if($json === null){
		
		// detect captcha first: inspect the RAW response body, the
		// decoded value is null at this point
		$this->fuckhtml->load($response);
		$this->detect_ass();
		
		// fallback to json decode error
		throw new Exception("Failed to decode JSON");
	}
	
	if(
		isset($json["message"]) &&
		$json["message"] != "success"
	){
		
		throw new Exception("Baidu returned an error: {$json["message"]}");
	}
	
	if(!isset($json["data"]["images"])){
		
		throw new Exception("Baidu did not return an image object");
	}
	
	$out = [
		"status" => "ok",
		"npt" => null,
		"image" => []
	];
	
	foreach($json["data"]["images"] as $image){
		
		// thumbnail size comes from the thumbnail URL's query (?w=..&h=..)
		parse_str((string)parse_url($image["thumburl"], PHP_URL_QUERY), $thumb_size);
		
		$out["image"][] = [
			"title" =>
				$this->fuckhtml
				->getTextContent(
					$image["titleShow"]
				),
			"source" => [
				[
					"url" => $image["objurl"],
					"width" => (int)$image["width"],
					"height" => (int)$image["height"]
				],
				[ // thumbnail
					"url" => $image["thumburl"],
					"width" => (int)($thumb_size["w"] ?? 0),
					"height" => (int)($thumb_size["h"] ?? 0)
				]
			],
			"url" => $image["fromUrl"]
		];
	}
	
	//
	// Detect if there's a next page
	//
	if((int)($json["data"]["totalNum"] ?? 0) >= $params["pn"] + 60){
		
		$out["npt"] =
			$this->backend->store(
				json_encode($params),
				"images",
				$proxy
			);
	}
	
	return $out;
}
|
||
|
||
/**
 * Fetch one page of Baidu video results from the vsearch endpoint.
 *
 * When $get["npt"] is set, the previous request parameters are loaded from
 * the backend and the offset is advanced; otherwise a new request is built
 * from $get["s"]. The response is split on "<script>" and each chunk is
 * parsed as one result.
 *
 * @param array $get Request parameters (npt, s).
 * @return array ["status", "npt", "video", "author", "livestream", "playlist", "reel"];
 *               only "video" is populated here.
 * @throws Exception If the search page cannot be fetched.
 */
public function video($get){
	
	// https://www.baidu.com/sf/vsearch?pd=video&tn=vsearch&wd=jak%2Band%2Bdaxter&async=1&pn=0
	// NOTE(review): an earlier note here said to increase &pn by 20 for
	// pagination, but the code steps by 10 and treats 10 results as a full
	// page -- confirm which is correct.
	
	if($get["npt"]){
		
		[$params, $proxy] = $this->backend->get($get["npt"], "videos");
		$params = json_decode($params, true);
		
		$params["pn"] = $params["pn"] + 10;
	}else{
		
		$proxy = $this->backend->get_ip();
		$params = [
			"pd" => "video",
			"tn" => "vsearch",
			"wd" => $get["s"],
			"async" => 1,
			"pn" => 0
		];
	}
	
	try{
		$html =
			$this->get(
				$proxy,
				"https://www.baidu.com/sf/vsearch",
				$params
			);
	}catch(Exception $error){
		
		throw new Exception("Failed to get search page");
	}
	
	$html =
		str_replace(
			["\r", "\n"],
			"",
			$html
		);
	
	$out = [
		"status" => "ok",
		"npt" => null,
		"video" => [],
		"author" => [],
		"livestream" => [],
		"playlist" => [],
		"reel" => []
	];
	
	$html = explode("<script>", $html);
	
	foreach($html as $result){
		
		$result = trim($result);
		
		$this->fuckhtml->load($result);
		
		// get URL: it is carried in an HTML comment inside the chunk
		preg_match(
			'/<!-- *([^ ]*) *-->/',
			$result,
			$matches
		);
		
		if(!isset($matches[1])){
			
			// no link, give up
			continue;
		}
		
		$link = $matches[1];
		
		// get title
		$title =
			$this->fuckhtml
			->getElementsByClassName(
				"video-title",
				"a"
			);
		
		if(count($title) === 0){
			
			// should not happen
			continue;
		}
		
		$title =
			$this->fuckhtml
			->getTextContent(
				$title[0]
			);
		
		// get thumbnail
		$img =
			$this->fuckhtml
			->getElementsByClassName(
				"border-radius",
				"img"
			);
		
		if(count($img) !== 0){
			
			$thumb = [
				"url" =>
					$this->unfuckthumb(
						$this->fuckhtml
						->getTextContent(
							$img[0]["attributes"]["src"]
						)
					),
				"ratio" => "16:9"
			];
		}else{
			
			$thumb = [
				"url" => null,
				"ratio" => null
			];
		}
		
		$span =
			$this->fuckhtml
			->getElementsByTagName(
				"span"
			);
		
		// get duration
		$duration =
			$this->fuckhtml
			->getElementsByClassName(
				"video_play_timer",
				$span
			);
		
		if(count($duration) !== 0){
			
			$duration =
				$this->hms2int(
					$this->fuckhtml
					->getTextContent(
						$duration[0]
					)
				);
		}else{
			
			$duration = null;
		}
		
		// get author
		// 来源:哔哩哔哩
		$author =
			$this->fuckhtml
			->getElementsByClassName(
				"wetSource",
				$span
			);
		
		if(count($author) !== 0){
			
			$author_parts =
				explode(
					":",
					$this->fuckhtml
					->getTextContent(
						$author[0]
					),
					2
				);
			
			// a label without a colon has no value part; report no author
			// instead of reading an undefined offset
			$author = $author_parts[1] ?? null;
		}else{
			
			$author = null;
		}
		
		// get date posted
		//发布时间:2024-05-06
		
		// AND get description
		// 简介:Our first look
		$infospans =
			array_merge(
				$this->fuckhtml
				->getElementsByClassName(
					"c-font-normal",
					$span
				),
				$this->fuckhtml
				->getElementsByClassName(
					"c-font-normal",
					"div"
				)
			);
		
		$date = null;
		$description = null;
		
		foreach($infospans as $infospan){
			
			// each info element is a "label:value" pair
			$infospan =
				explode(
					":",
					$this->fuckhtml
					->getTextContent(
						$infospan
					),
					2
				);
			
			if(count($infospan) !== 2){
				
				// should not happen
				continue;
			}
			
			$infospan[1] =
				$this->fuckhtml
				->getTextContent(
					$infospan[1]
				);
			
			switch($infospan[0]){
				
				case "发布时间": // date posted
					$date = $this->parse_time($infospan[1]);
					break;
				
				case "简介": // description
					$description = $infospan[1];
					break;
			}
		}
		
		$out["video"][] = [
			"title" => $this->titledots($title),
			// keep a missing description as null instead of passing null to trim()
			"description" => $description === null ? null : $this->titledots($description),
			"author" => [
				"name" => $author,
				"url" => null,
				"avatar" => null
			],
			"date" => $date,
			"duration" => $duration,
			"views" => null,
			"thumb" => $thumb,
			"url" => $link
		];
	}
	
	if(count($out["video"]) === 10){
		
		// assume there's another page after this
		$out["npt"] =
			$this->backend->store(
				json_encode($params),
				"videos",
				$proxy
			);
	}
	
	return $out;
}
|
||
|
||
/**
 * Fetch one page of Baidu news results.
 *
 * With $get["npt"] the stored cookie and request are restored and the
 * offset is advanced by 20; otherwise a new request is built from
 * $get["s"]. The page is handed to parse_search() and its "web" entries
 * are reshaped into news articles.
 *
 * @param array $get Request parameters (npt, s).
 * @return array ["status" => "ok", "npt" => token from parse_search, "news" => list of articles]
 * @throws Exception If the search page cannot be fetched.
 */
public function news($get){
	
	$is_first_page = !$get["npt"];
	
	if($is_first_page){
		
		$proxy = $this->backend->get_ip();
		
		$npt_data = [
			"wd" => $get["s"],
			"rn" => 20,
			"tn" => "news"
		];
		
		// @TODO add filters
	}else{
		
		[$stored, $proxy] = $this->backend->get($get["npt"], "news");
		
		$stored = json_decode($stored, true);
		$this->cookie = $stored["cookie"];
		$npt_data = $stored["req"];
		
		$npt_data["pn"] = $npt_data["pn"] + 20;
	}
	
	try{
		
		$html =
			$this->get(
				$proxy,
				"https://www.baidu.com/s",
				$npt_data
			);
	}catch(Exception $error){
		
		throw new Exception("Failed to fetch search page");
	}
	
	if($is_first_page){
		
		// the first request is sent without an offset; it is only recorded
		// afterwards so later pages can increment it
		$npt_data["pn"] = 0;
	}
	
	$data = $this->parse_search($proxy, "news", $npt_data, $html);
	
	$articles = [];
	
	foreach($data["web"] as $entry){
		
		$thumb_url = $entry["thumb"]["url"];
		
		$articles[] = [
			"title" => $entry["title"],
			"author" => null,
			"description" => $entry["description"],
			"date" => $entry["date"],
			"thumb" => [
				"url" => $thumb_url,
				"ratio" => $thumb_url !== null ? "16:9" : null
			],
			"url" => $entry["url"]
		];
	}
	
	return [
		"status" => "ok",
		"npt" => $data["npt"],
		"news" => $articles
	];
}
|
||
|
||
/**
 * Clean up a Baidu thumbnail URL.
 *
 * - gimg*.baidu.com proxy URLs are unwrapped to the URL-decoded value of
 *   their "src=" parameter.
 * - Other URLs have everything after the first "&" re-parsed as a query
 *   string: "size" is dropped and "q", when present, is set to "90".
 *
 * @param string $url Thumbnail URL.
 * @return string Rewritten URL, or $url unchanged when it does not have the expected shape.
 */
private function unfuckthumb($url){
	
	// probe for proxy URL
	if(
		preg_match(
			'/^https?:\/\/gimg(?:[0-9]+)?\.baidu\.com/',
			$url
		)
	){
		
		$parts = explode("src=", $url);
		
		if(count($parts) !== 2){
			
			// unexpected shape, leave the URL alone
			return $url;
		}
		
		return urldecode(explode("&", $parts[1])[0]);
	}
	
	$q = explode("&", $url, 2);
	
	if(count($q) !== 2){
		
		// no parameters to rewrite
		return $url;
	}
	
	// baidu does not follow spec and may put a "?" in the middle of the parameters:
	// &fmt=auto?s=BB32F3A050471AEC72886934030090C4&sec=1753203600&t=0fb2194775d3bd3d1bb114b818479e0a
	parse_str(str_replace("?", "&", $q[1]), $query);
	
	unset($query["size"]);
	if(isset($query["q"])){ $query["q"] = "90"; }
	
	// Rebuild from the two halves instead of str_replace() on the whole URL:
	// str_replace would also rewrite the prefix if the tail text occurs there.
	return $q[0] . "&" . http_build_query($query);
}
|
||
|
||
/**
 * Strip surrounding whitespace, dots and ellipsis characters from a string.
 *
 * trim() treats its character list as single bytes, so listing the
 * multibyte "…" there also strips bytes that belong to neighbouring UTF-8
 * characters (for example the last byte of "一"). A unicode-aware regex is
 * used instead.
 *
 * @param string|null $title Text to clean; null is treated as an empty string.
 * @return string Cleaned text.
 */
private function titledots($title){
	
	$title = (string)$title;
	
	$trimmed =
		preg_replace(
			'/\A[ .\t\n\r\0\x0B…]+|[ .\t\n\r\0\x0B…]+\z/u',
			"",
			$title
		);
	
	if($trimmed === null){
		
		// input is not valid UTF-8: only strip the single-byte characters
		return trim($title, " .\t\n\r\0\x0B");
	}
	
	return $trimmed;
}
|
||
|
||
/**
 * Convert a "h:m:s", "m:s" or "s" duration string into seconds.
 *
 * @param string $time Colon-separated duration.
 * @return int Duration in seconds.
 */
private function hms2int($time){
	
	// at most three fields; anything after the second colon stays in the last field
	$fields = explode(":", $time, 3);
	
	// fold left to right: every earlier field is worth 60x the next one
	$seconds = 0;
	
	foreach($fields as $field){
		
		$seconds = ($seconds * 60) + (int)$field;
	}
	
	return $seconds;
}
|
||
|
||
/**
 * Parse a Chinese view-count label into an integer.
 *
 * Handles plain counts ("123次") and counts scaled by 万 (10,000) or
 * 亿 (100,000,000), including decimal values such as "1.5万次".
 *
 * @param string $views Label text.
 * @return int|null View count, or null when no count is recognised.
 */
private function parse_viewcount($views){
	
	$scales = [
		"亿" => 100000000,
		"万" => 10000
	];
	
	if(
		// scaled count; the decimal part must be captured too, otherwise
		// "1.5万次" would be read as 5万
		preg_match(
			'/([0-9]+(?:\.[0-9]+)?)(亿|万)次/',
			$views,
			$matches
		)
	){
		
		return (int)round((float)$matches[1] * $scales[$matches[2]]);
	}
	
	if(
		// units
		preg_match(
			'/([0-9]+)次/',
			$views,
			$matches
		)
	){
		
		return (int)$matches[1];
	}
	
	return null;
}
|
||
|
||
/**
 * Parse a Baidu date label into a unix timestamp.
 *
 * Recognised forms, checked in this order:
 *   - 2023年8月7日          absolute date
 *   - 昨天 / 昨天11:45      yesterday, optionally with a time
 *   - N分钟前, N小时前, N天前, N周前, N个月前, N年前   relative offsets
 *   - anything else is handed to strtotime() as-is
 *
 * @param string $time Date label.
 * @return int|null Timestamp, or null when the label cannot be parsed.
 */
private function parse_time($time){
	
	$relative_units = [
		"分钟" => "minutes",
		"小时" => "hours",
		"天" => "days",
		"周" => "weeks",
		"个月" => "months",
		"年" => "years"
	];
	
	if(
		// 2023年8月7日 => yyyy/m/d
		preg_match(
			'/([0-9]{4})年([0-9]{1,2})月([0-9]{1,2})日/',
			$time,
			$matches
		)
	){
		
		$parsed = strtotime("{$matches[1]}/{$matches[2]}/{$matches[3]}");
	}elseif(
		// 昨天11:45 => yesterday at 11:45
		// 昨天 => yesterday
		preg_match(
			'/昨天(.*)/',
			$time,
			$matches
		)
	){
		
		$parsed = strtotime("Yesterday {$matches[1]}");
	}elseif(
		// 3天前 => 3 days ago
		// 1个月前 => 1 months ago
		preg_match(
			'/([0-9]{1,4})(分钟|小时|天|周|个月|年)前/',
			$time,
			$matches
		)
	){
		
		$parsed = strtotime("{$matches[1]} {$relative_units[$matches[2]]} ago");
	}else{
		
		// attempt to parse as-is
		$parsed = strtotime($time);
	}
	
	// strtotime() signals failure with false; callers expect null
	return $parsed === false ? null : $parsed;
}
|
||
|
||
/**
 * Inspect the document currently loaded into fuckhtml for a captcha page.
 *
 * A captcha is reported when the document has no anchors at all, or when
 * the href of its first anchor points at the wappass.baidu.com captcha path.
 *
 * @throws Exception When a captcha is detected.
 */
private function detect_ass(){
	
	$anchors = $this->fuckhtml->getElementsByTagName("a");
	
	// no anchors at all is treated as a captcha page
	$is_captcha = count($anchors) === 0;
	
	if(!$is_captcha){
		
		$first_href =
			$this->fuckhtml
			->getTextContent(
				$anchors[0]["attributes"]["href"]
			);
		
		$is_captcha =
			preg_match(
				'/^https?:\/\/wappass\.baidu\.com\/static\/captcha/',
				$first_href
			);
	}
	
	if($is_captcha){
		
		throw new Exception("Baidu returned a Captcha");
	}
}
|
||
}
|