4get/scraper/baidu.php

2230 lines
42 KiB
PHP
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

<?php
class baidu{
//
// Scraper state. These used to be created on the fly inside the
// constructor, which is deprecated since PHP 8.2 (dynamic properties).
// They stay public, as dynamic properties always were.
//
public $backend; // proxy/token storage helper from lib/backend.php
public $fuckhtml; // HTML parser from lib/fuckhtml.php
public $handles = []; // [category][result index][sublink index] => curl handle
public $proc = null; // curl multi handle used by resolve_urls()
public $handle_category = null; // category currently being queued
public $handle_increment = 0; // result index currently being queued
public $sublink_increment = 0; // 0 = the result itself, n = its n-th sublink
public $cookie = null; // "name=value; name2=value2" or null when empty

//
// Load the helper libraries and reset the per-request state.
//
public function __construct(){
	// include_once: a plain include would redeclare the class (fatal error)
	// when the library has already been loaded earlier in the request
	include_once "lib/backend.php";
	$this->backend = new backend("baidu");
	include_once "lib/fuckhtml.php";
	$this->fuckhtml = new fuckhtml();
	$this->handles = [];
	$this->proc = null;
	$this->handle_category = null;
	$this->handle_increment = 0;
	$this->sublink_increment = 0;
	$this->cookie = null;
}
//
// Describe the search filters available for a result page.
//
// $page: "web", "images", "videos" or "news"
// Returns an array of filter name => ["display" => label, "option" => ...]
// where "option" is either the "_DATE" marker or a value => label list.
// Returns null for any other page.
//
public function getfilters($page){
	if($page == "web"){
		$date_filters = [];
		foreach(["newer" => "Newer than", "older" => "Older than"] as $name => $label){
			$date_filters[$name] = [
				"display" => $label,
				"option" => "_DATE"
			];
		}
		return $date_filters;
	}
	if($page == "images"){
		// option value => label, grouped per filter.
		// the trailing comments are the query parameters image() sends
		$sort = [
			"relevance" => "Relevance", // no param
			"latest" => "Latest", // &latest=1
			"hot" => "Hot" // &hot=1
		];
		$size = [
			"any" => "Any size",
			"7" => "Extra large (1080px+)", // &z=7
			"6" => "Large (600px~1080px)", // &z=6
			"5" => "Medium (300px~600px)", // &z=5
			"4" => "Small (1px~300px)" // &z=4
		];
		$ratio = [
			"any" => "Any ratio",
			"1" => "Tall vertical", // &imgratio=1
			"2" => "Vertical", // &imgratio=2
			"3" => "Square", // &imgratio=3
			"4" => "Horizontal", // &imgratio=4
			"5" => "Wide horizontal" // &imgratio=5
		];
		$format = [
			"any" => "Any format",
			"3" => "JPG", // &imgformat=3
			"5" => "JPEG", // &imgformat=5
			"4" => "PNG", // &imgformat=4
			"2" => "BMP", // &imgformat=2
			"6" => "GIF (Animated)" // &imgformat=6
		];
		$color = [
			"any" => "Any color",
			"1024" => "White", // &ic=1024
			"2048" => "Black & White",
			"512" => "Black",
			"64" => "Magenta",
			"16" => "Blue",
			"1" => "Red",
			"2" => "Yellow",
			"32" => "Purple",
			"4" => "Green",
			"8" => "Teal",
			"256" => "Orange",
			"128" => "Brown"
		];
		$type = [
			"any" => "Any type",
			"hd" => "HD", // &hd=1
			"isImgSet" => "Photo album", // &isImgSet=1
			"copyright" => "Copyright" // &copyright=1
		];
		return [
			"sort" => ["display" => "Sort", "option" => $sort],
			"size" => ["display" => "Size", "option" => $size],
			"ratio" => ["display" => "Ratio", "option" => $ratio],
			"format" => ["display" => "Format", "option" => $format],
			"color" => ["display" => "Color", "option" => $color],
			"type" => ["display" => "Type", "option" => $type]
		];
	}
	if($page == "videos"){
		// no filters for video search
		return [];
	}
	if($page == "news"){
		$category = [
			"any" => "All news",
			"media" => "Media websites", // &medium=1
			"baijiahao" => "Baidu Baijiahao" // &medium=2
		];
		return [
			"category" => ["display" => "Category", "option" => $category]
		];
	}
	// unknown page
	return null;
}
//
// Perform a GET request and remember the cookies Baidu hands back.
//
// $proxy: proxy descriptor, passed straight to backend->assign_proxy()
// $url: endpoint without query string
// $get: query parameters, appended with http_build_query()
// $referer: false => send browser "document navigation" headers,
//           string => send XHR/JSON headers with that Referer
//
// Returns the response body.
// Throws Exception with the curl error message on transport failure.
//
// Side effect: $this->cookie is updated with every Set-Cookie received.
// It is kept as a "name=value; name2=value2" string, or null when there
// are no cookies at all.
//
private function get($proxy, $url, $get = [], $referer = false){
	if($get !== []){
		$url .= "?" . http_build_query($get);
	}
	$has_cookie = is_string($this->cookie) && $this->cookie !== "";
	//
	// Build the header list. The order mimics what a browser sends,
	// so it is kept exactly as it was in the four hand-written variants.
	//
	$headers = ["User-Agent: " . config::USER_AGENT];
	if($referer === false){
		$headers[] = "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8";
	}else{
		$headers[] = "Accept: application/json, text/plain, */*";
	}
	$headers[] = "Accept-Language: en-US,en;q=0.5";
	$headers[] = "Accept-Encoding: gzip, deflate, br, zstd";
	if($referer !== false){
		$headers[] = "Referer: {$referer}";
	}
	$headers[] = "DNT: 1";
	$headers[] = "Sec-GPC: 1";
	$headers[] = "Connection: keep-alive";
	if($has_cookie){
		// never send an empty "Cookie:" header
		$headers[] = "Cookie: {$this->cookie}";
	}
	$headers[] = "Upgrade-Insecure-Requests: 1";
	if($referer === false){
		$headers[] = "Sec-Fetch-Dest: document";
		$headers[] = "Sec-Fetch-Mode: navigate";
		$headers[] = "Sec-Fetch-Site: cross-site";
		$headers[] = "Priority: u=0, i";
	}else{
		$headers[] = "Sec-Fetch-Dest: empty";
		$headers[] = "Sec-Fetch-Mode: cors";
		$headers[] = "Sec-Fetch-Site: same-origin";
	}
	$curlproc = curl_init();
	// collect Set-Cookie name/value pairs while the headers stream in
	$cookies_tmp = [];
	curl_setopt($curlproc, CURLOPT_HEADERFUNCTION, function($curlproc, $header) use (&$cookies_tmp){
		$length = strlen($header);
		$header = explode(":", $header, 2);
		if(
			count($header) === 2 &&
			trim(strtolower($header[0])) == "set-cookie"
		){
			$cookie_tmp = explode("=", trim($header[1]), 2);
			if(
				count($cookie_tmp) === 2 &&
				trim($cookie_tmp[0]) !== ""
			){
				// drop the attributes (Path, Expires, ...) after the first ";"
				$cookies_tmp[trim($cookie_tmp[0])] =
					explode(";", $cookie_tmp[1], 2)[0];
			}
		}
		// curl aborts the transfer unless we report every byte as consumed
		return $length;
	});
	curl_setopt($curlproc, CURLOPT_URL, $url);
	curl_setopt($curlproc, CURLOPT_ENCODING, ""); // default encoding
	curl_setopt($curlproc, CURLOPT_HTTPHEADER, $headers);
	curl_setopt($curlproc, CURLOPT_RETURNTRANSFER, true);
	curl_setopt($curlproc, CURLOPT_SSL_VERIFYHOST, 2);
	curl_setopt($curlproc, CURLOPT_SSL_VERIFYPEER, true);
	curl_setopt($curlproc, CURLOPT_CONNECTTIMEOUT, 30);
	curl_setopt($curlproc, CURLOPT_TIMEOUT, 30);
	$this->backend->assign_proxy($curlproc, $proxy);
	$data = curl_exec($curlproc);
	if(curl_errno($curlproc)){
		// read the message before releasing the handle
		$error = curl_error($curlproc);
		curl_close($curlproc);
		throw new Exception($error);
	}
	curl_close($curlproc);
	//
	// Store cookies: merge by name so a cookie that is set again
	// replaces the old value instead of being appended a second time
	//
	$jar = [];
	if($has_cookie){
		foreach(explode(";", $this->cookie) as $pair){
			$pair = explode("=", trim($pair), 2);
			if($pair[0] === ""){
				continue;
			}
			$jar[$pair[0]] = isset($pair[1]) ? $pair[1] : "";
		}
	}
	foreach($cookies_tmp as $cookie_name => $cookie_value){
		$jar[$cookie_name] = $cookie_value;
	}
	$pairs = [];
	foreach($jar as $cookie_name => $cookie_value){
		$pairs[] = $cookie_name . "=" . $cookie_value;
	}
	$this->cookie = $pairs === [] ? null : implode("; ", $pairs);
	return $data;
}
//
// Queue a HEAD-style request for a baidu.com/link? redirect on the
// shared curl multi handle ($this->proc).
//
// URLs that are not Baidu redirects are ignored. The handle is filed
// under $this->handles[category][result index][sublink index] using the
// counters maintained by resolve_urls(), which later reads the Location
// header from it.
//
private function redirect_add_url($proxy, $url){
	$redirect_probe =
		preg_match(
			'/^https?:\/\/(?:www\.)?baidu\.com\/link\?/',
			$url
		);
	if($redirect_probe === 0){
		// not a baidu redirect
		return;
	}
	$request_headers = [
		"User-Agent: " . config::USER_AGENT,
		"Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
		"Accept-Language: en-US,en;q=0.5",
		"Accept-Encoding: gzip, deflate, br, zstd",
		"DNT: 1",
		"Sec-GPC: 1",
		"Connection: keep-alive",
		"Upgrade-Insecure-Requests: 1",
		"Sec-Fetch-Dest: document",
		"Sec-Fetch-Mode: navigate",
		"Sec-Fetch-Site: none",
		"Sec-Fetch-User: ?1",
		"Priority: u=0, i"
	];
	// applied one by one, in this order
	$options = [
		CURLOPT_URL => $url,
		CURLOPT_ENCODING => "", // default encoding
		CURLOPT_HTTPHEADER => $request_headers,
		CURLOPT_RETURNTRANSFER => true,
		CURLOPT_SSL_VERIFYHOST => 2,
		CURLOPT_SSL_VERIFYPEER => true,
		CURLOPT_CONNECTTIMEOUT => 30,
		CURLOPT_TIMEOUT => 30,
		// we only want the response headers back, no body
		CURLOPT_HEADER => true,
		CURLOPT_NOBODY => true
	];
	$handle = curl_init();
	foreach($options as $option => $value){
		curl_setopt($handle, $option, $value);
	}
	$this->backend->assign_proxy($handle, $proxy);
	curl_multi_add_handle($this->proc, $handle);
	$category = $this->handle_category;
	$result_index = $this->handle_increment;
	$sublink_index = $this->sublink_increment;
	$this->handles[$category][$result_index][$sublink_index] = $handle;
}
//
// Replace baidu.com/link? tracking redirects with their real target.
//
// $proxy: proxy descriptor forwarded to every request
// $collection: result set, modified in place. For every category in
//              $categories, each item's "url" and each of its
//              "sublink" urls is rewritten when a Location header
//              comes back.
// $categories: list of keys of $collection to process
//
// All redirects are resolved in parallel through one curl multi handle.
// Links that fail to resolve keep their original URL.
//
private function resolve_urls($proxy, &$collection, $categories){
	$this->proc = curl_multi_init();
	// drop handles left over from a previous call, they are already closed
	$this->handles = [];
	foreach($categories as $category){
		if(
			!isset($collection[$category]) ||
			!is_array($collection[$category])
		){
			continue;
		}
		$this->sublink_increment = 0;
		$this->handle_increment = 0;
		$this->handle_category = $category;
		foreach($collection[$category] as $item){
			// index 0 is the result itself, 1..n are its sublinks
			$this->sublink_increment = 0;
			$this->redirect_add_url($proxy, $item["url"]);
			if(isset($item["sublink"])){
				foreach($item["sublink"] as $sublink){
					$this->sublink_increment++;
					$this->redirect_add_url($proxy, $sublink["url"]);
				}
			}
			$this->handle_increment++;
		}
	}
	do{
		$status = curl_multi_exec($this->proc, $active);
		if($active && $status === CURLM_OK){
			// block until a transfer has activity instead of spinning
			// on curl_multi_exec() at 100% CPU
			if(curl_multi_select($this->proc, 1.0) === -1){
				// select failed, back off briefly before polling again
				usleep(1000);
			}
		}
	}while($active && $status === CURLM_OK);
	//
	// All transfers are finished, collect the Location headers
	//
	foreach($this->handles as $category => $items){
		foreach($items as $index => $handles){
			foreach($handles as $sublinkindex => $handle){
				$location = [];
				preg_match(
					'/location: ?(.*)$/im',
					(string)curl_multi_getcontent($handle),
					$location
				);
				if(isset($location[1])){
					if($sublinkindex === 0){
						$collection[$category][$index]["url"] = trim($location[1]);
					}else{
						$collection[$category][$index]["sublink"][$sublinkindex - 1]["url"] = trim($location[1]);
					}
				}
				curl_multi_remove_handle($this->proc, $handle);
				curl_close($handle);
			}
		}
	}
	curl_multi_close($this->proc);
}
//
// Upgrade the image carousel in $data["image"] with direct image URLs.
//
// The second image's detail page embeds a JSON blob
// (script#image-detail-data) describing the whole image set.
// For some reason, requesting the second image's url in the set
// doesn't trigger the captcha.
//
// $proxy: proxy descriptor forwarded to get()
// $data: result set, modified in place. $data["image"] is replaced
//        only when the viewer yields at least one usable image,
//        otherwise the previously scraped carousel is kept.
//
private function resolve_images($proxy, &$data){
	if(
		!isset($data["image"][1]["url"]) ||
		preg_match(
			'/^https:\/\/image\.baidu\.com\/search\/detail/',
			$data["image"][1]["url"]
		) === 0
	){
		// no carousel, or the links are already resolved: do nothing
		return;
	}
	try{
		$html =
			$this->get(
				$proxy,
				$data["image"][1]["url"],
				[]
			);
	}catch(Exception $error){
		// fallback to the limited dataset we have
		return;
	}
	$this->fuckhtml->load($html);
	$script =
		$this->fuckhtml
		->getElementById(
			"image-detail-data",
			"script"
		);
	if(!$script){
		return;
	}
	$json =
		json_decode(
			$script["innerHTML"],
			true
		);
	if(
		!isset($json["data"]["images"]) ||
		!is_array($json["data"]["images"]) ||
		count($json["data"]["images"]) === 0
	){
		// do nothing
		return;
	}
	//
	// Build the replacement set from the downloaded image viewer.
	// The imageset !!should!! be the same as the carousel
	//
	$images = [];
	foreach($json["data"]["images"] as $image){
		if(!isset($image["objurl"], $image["thumburl"])){
			// unusable entry
			continue;
		}
		// thumbnail dimensions are encoded in its query string (&w=..&h=..)
		$thumb_size = [];
		$thumb_query = parse_url($image["thumburl"], PHP_URL_QUERY);
		if(is_string($thumb_query)){
			parse_str($thumb_query, $thumb_size);
		}
		$images[] = [
			"title" =>
				$this->fuckhtml
				->getTextContent(
					$image["titleShow"] ?? ""
				),
			"source" => [
				[
					"url" => $image["objurl"],
					"width" => (int)($image["width"] ?? 0),
					"height" => (int)($image["height"] ?? 0)
				],
				[ // thumbnail
					"url" => $image["thumburl"],
					"width" => (int)($thumb_size["w"] ?? 0),
					"height" => (int)($thumb_size["h"] ?? 0)
				]
			],
			"url" => $image["fromUrl"] ?? null
		];
	}
	if($images !== []){
		// only now discard the previously scraped images
		$data["image"] = $images;
	}
}
//
// Web search.
//
// $get: request parameters
//   "npt"   => next page token, or falsy for the first page
//   "s"     => search terms (first page only)
//   "newer" => lower date bound, or false
//   "older" => upper date bound, or false
//
// Returns the result set built by parse_search().
// Throws Exception when the page cannot be fetched, the next page
// token cannot be decoded, or Baidu answers with a captcha.
//
public function web($get){
	if($get["npt"]){
		[$json, $proxy] = $this->backend->get($get["npt"], "web");
		$json = json_decode($json, true);
		if(!isset($json["req"])){
			throw new Exception("Failed to decode next page token");
		}
		// resume the session that produced the previous page
		$this->cookie = isset($json["cookie"]) ? $json["cookie"] : null;
		$npt_data = $json["req"];
		$npt_data["pn"] = $npt_data["pn"] + 20;
	}else{
		$proxy = $this->backend->get_ip();
		// fetching https://www.baidu.com first would populate
		// $this->cookie
		// @TODO probably not needed? We get blocked anyways
		//$this->get($proxy, "https://www.baidu.com", []);
		$npt_data = [
			"wd" => $get["s"],
			"rn" => 20
		];
		// &gpc=stf%3D0%2C1752638400|stftype%3D2
		$newer = isset($get["newer"]) ? $get["newer"] : false;
		$older = isset($get["older"]) ? $get["older"] : false;
		if(
			$older !== false ||
			$newer !== false
		){
			// stf is "<from>,<to>": an unset lower bound is 0, so the
			// first slot is the start of the range. "Newer than" is
			// therefore the start and "Older than" the end; a missing
			// end defaults to now instead of an empty string
			$from = $newer === false ? 0 : $newer;
			$to = $older === false ? time() : $older;
			$npt_data["gpc"] = "stf={$from},{$to}|stftype=2";
		}
	}
	try{
		$html = $this->get(
			$proxy,
			"https://www.baidu.com/s",
			$npt_data
		);
	}catch(Exception $error){
		throw new Exception("Failed to fetch search page");
	}
	if(!isset($npt_data["pn"])){
		// the first page is requested without &pn, but the stored
		// token needs an offset to increment from
		$npt_data["pn"] = 0;
	}
	return $this->parse_search($proxy, "web", $npt_data, $html);
}
//
// Parse a Baidu result page (web or news tab) into the output structure.
//
// $proxy: proxy descriptor; stored with the next page token and
//         forwarded to resolve_urls() / resolve_images()
// $pagetype: token namespace passed to backend->store()
// $npt_data: request parameters to store in the next page token
// $html: raw page body
//
// Returns an array with the keys status, spelling, npt, answer, web,
// image, video, news and related.
// detect_ass() throws Exception when the page looks like a captcha.
//
// NOTE: the fuckhtml parser is stateful. Every load() call changes
// what the following getElements*() calls search in, so the order of
// statements below matters.
//
private function parse_search($proxy, $pagetype, $npt_data, $html){
// @HACK
// remove newlines from the html, they confuse the fuckhtml parser
$html = str_replace(["\n", "\r"], "", $html);
$out = [
"status" => "ok",
"spelling" => [
"type" => "no_correction",
"using" => null,
"correction" => null
],
"npt" => null,
"answer" => [],
"web" => [],
"image" => [],
"video" => [],
"news" => [],
"related" => []
];
$this->fuckhtml->load($html);
// bail out early if we were served a captcha
$this->detect_ass();
// every <div> that carries an id attribute; search results are
// picked out of this list further down by their numeric id
$datafields =
$this->fuckhtml
->getElementsByAttributeName(
"id",
"div"
);
//
// Get next page
//
// an <a class="n"> link is taken as the sign that more pages exist
$npt =
$this->fuckhtml
->getElementsByClassName(
"n",
"a"
);
if(count($npt) !== 0){
// the cookie is stored too, so web()/news() can restore it
$out["npt"] =
$this->backend->store(
json_encode([
"req" => $npt_data,
"cookie" => $this->cookie
]),
$pagetype,
$proxy
);
}
//
// Get related searches
//
$related_container =
$this->fuckhtml
->getElementById(
"rs_new",
$datafields
);
if($related_container){
$this->fuckhtml->load($related_container);
$as =
$this->fuckhtml
->getElementsByClassName(
"c-color-link",
"a"
);
foreach($as as $a){
// keep only the text after the first ">" when there is one
$text =
explode(
">",
$this->fuckhtml
->getTextContent(
$a
),
2
);
$out["related"][] = $text[count($text) - 1];
}
}
//
// Walk over every result container
//
foreach($datafields as $datafield){
if(
!isset($datafield["attributes"]["id"]) ||
preg_match(
'/^[0-9]+$/',
$datafield["attributes"]["id"]
) === 0
){
// not a search result
continue;
}
$this->fuckhtml->load($datafield);
$div =
$this->fuckhtml
->getElementsByTagName(
"div"
);
//
// Don't parse as a search result if it's a card
//
$card =
$this->fuckhtml
->getElementsByClassName(
"cosc-card",
$div
);
if(count($card) !== 0){
//
// Parse short video cards
//
$ytshorts_probe =
$this->fuckhtml
->getElementsByClassName(
"tts-b-item",
$div
);
if(count($ytshorts_probe) !== 0){
$videos =
$this->fuckhtml
->getElementsByAttributeValue(
"data-show",
"list",
$div
);
foreach($videos as $video){
$this->fuckhtml->load($video);
$title =
$this->fuckhtml
->getElementsByClassName(
"cosc-title-slot",
"span"
);
if(count($title) === 0){
continue;
}
$url =
$this->fuckhtml
->getElementsByTagName(
"a"
);
if(count($url) === 0){
continue;
}
$image =
$this->fuckhtml
->getElementsByClassName(
"cos-image-body",
"img"
);
if(count($image) === 0){
$image = [
"ratio" => null,
"url" => null
];
}else{
$image = [
"ratio" => "1:1",
"url" =>
$this->fuckhtml
->getTextContent(
$image[0]["attributes"]["src"]
)
];
}
// get duration
// (first div whose class contains "tag-bottom-right")
$divs =
$this->fuckhtml
->getElementsByAttributeName(
"class",
"div"
);
$duration = null;
foreach($divs as $probe){
if(strpos($probe["attributes"]["class"], "tag-bottom-right") !== false){
$duration =
$this->hms2int(
$this->fuckhtml
->getTextContent(
$probe
)
);
break;
}
}
$out["video"][] = [
"title" =>
$this->fuckhtml
->getTextContent(
$title[0]
),
"description" => null,
"date" => null,
"duration" => $duration,
"views" => null,
"thumb" => $image,
"url" =>
$this->fuckhtml
->getTextContent(
$url[0]["attributes"]["href"]
)
];
}
}
//
// Parse image carousel
//
$is_image_carousel = false;
foreach($div as $d){
if(
isset($d["attributes"]["class"]) &&
strpos($d["attributes"]["class"], "image-container") !== false
){
$is_image_carousel = true;
break;
}
}
if($is_image_carousel){
// the carousel data sits in a <!--s-data:{...}--> comment
preg_match(
'/<!--s-data:([\S\s]*)-->/U',
$datafield["innerHTML"],
$matches
);
if(isset($matches[1])){
// weird behavior with the smaller image carousel where --cos* CSS variables are escaped wrong
$json =
$this->fuckhtml
->parseJsObject(
str_replace(
"-\-",
"--",
$matches[1]
)
);
if(
$json !== null &&
isset($json["imageList"][0]["images"])
){
// parse image carousel
foreach($json["imageList"][0]["images"] as $image){
// thumbnail size is read from the &w= and &h= query parameters
parse_str(parse_url($image["thumburl"], PHP_URL_QUERY), $thumb_size);
$out["image"][] = [
"title" => "image",
"source" => [
[
"url" => $image["objurl"],
"width" => (int)$image["width"],
"height" => (int)$image["height"]
],
[ // thumbnail
"url" => $image["thumburl"],
"width" => (int)$thumb_size["w"],
"height" => (int)$thumb_size["h"]
]
],
"url" => $image["jumpUrl"]
];
}
}
}
}
// cards never produce a regular web result
continue;
}
if(!isset($datafield["attributes"]["mu"])){
// dont scrape if we dont have the direct link
continue;
}
// class:FYB_RD -> News clutter, IGNORE
$result =
$this->fuckhtml
->getElementsByClassName(
"result",
[$datafield]
);
if(count($result) !== 0){
//
// Parse normal search result
//
$title =
$this->fuckhtml
->getElementsByClassName(
"sc-link",
"a"
);
if(count($title) === 0){
// should not happen
continue;
}
$title =
$this->titledots(
$this->fuckhtml
->getTextContent(
$title[0]
)
);
$description =
$this->fuckhtml
->getElementsByClassName(
"c-color",
$div
);
if(count($description) !== 0){
// narrow the parser down to the description container, then
// look for the first span whose class contains "summary-text"
$this->fuckhtml->load($description[0]);
$description =
$this->fuckhtml
->getElementsByAttributeName(
"class",
"span"
);
$found_desc = false;
foreach($description as $desc){
if(stripos($desc["attributes"]["class"], "summary-text") !== false){
$found_desc = true;
$description =
$this->titledots(
$this->fuckhtml
->getTextContent(
$desc
)
);
break;
}
}
if($found_desc === false){
$description = null;
}
// restore the parser to the whole result
$this->fuckhtml->load($datafield);
}else{
$description = null;
}
// parse date
$date_probe =
$this->fuckhtml
->getElementsByClassName(
"cos-color-text-minor",
"span"
);
if(count($date_probe) !== 0){
$date =
$this->parse_time(
$this->fuckhtml
->getTextContent(
$date_probe[0]
)
);
}else{
$date = null;
}
// parse image
$img =
$this->fuckhtml
->getElementsByTagName(
"img"
);
if(count($img) !== 0){
$image = [
"ratio" => "16:9",
"url" =>
$this->unfuckthumb(
$this->fuckhtml
->getTextContent(
$img[0]["attributes"]["src"]
)
)
];
}else{
$image = [
"ratio" => null,
"url" => null
];
}
// get page type
// NOTE(review): this overwrites the $pagetype parameter. That is
// harmless today because the parameter is only read when the next
// page token is stored, before this loop starts.
// When several <b> tags exist, the last one wins.
$pagetype_probe =
$this->fuckhtml
->getElementsByTagName(
"b"
);
$pagetype = "web";
foreach($pagetype_probe as $probe){
$pagetype =
strtolower(
trim(
$this->fuckhtml
->getTextContent(
$probe
),
" 【】"
)
);
}
// get extra links
// (only the first div whose class contains "exta-link" is used)
$sublinks = [];
foreach($div as $d){
if(
isset($d["attributes"]["class"]) &&
strpos($d["attributes"]["class"], "exta-link") !== false
){
$this->fuckhtml->load($d);
$links =
$this->fuckhtml
->getElementsByClassName(
"cos-space-mt-xs",
"div"
);
foreach($links as $link){
$this->fuckhtml->load($link);
$s_title =
$this->fuckhtml
->getElementsByTagName(
"h3"
);
if(count($s_title) === 0){
// should not happen
continue;
}
// the target URL is JSON inside JSON:
// h3[data-click] -> "clk_info" -> "url"
$data2 =
json_decode(
$this->fuckhtml
->getTextContent(
$s_title[0]["attributes"]["data-click"]
),
true
);
if(!isset($data2["clk_info"])){
// unexpected payload, skip
continue;
}
$data2 =
json_decode(
$data2["clk_info"],
true
);
if(!isset($data2["url"])){
// no link, skip
continue;
}
$url =
rawurldecode(
$data2["url"]
);
// the sublink description comes from p[sub-show-log] -> ext.content
$data =
$this->fuckhtml
->getElementsByTagName(
"p"
);
$s_description = null;
if(count($data) !== 0){
$data =
json_decode(
$this->fuckhtml
->getTextContent(
$data[0]["attributes"]["sub-show-log"]
),
true
);
if(isset($data["ext"]["content"])){
$s_description = $data["ext"]["content"];
}
}
$sublinks[] = [
"title" =>
$this->fuckhtml
->getTextContent(
$s_title[0]
),
"description" => $s_description,
"url" => $url,
"date" => null
];
}
break;
}
}
// the "mu" attribute holds the direct (non-tracking) link
$out["web"][] = [
"title" => $title,
"description" => $description,
"url" =>
$this->fuckhtml
->getTextContent(
$datafield["attributes"]["mu"]
),
"date" => $date,
"type" => $pagetype,
"thumb" => $image,
"sublink" => $sublinks,
"table" => []
];
continue;
}
// parse special result
$result =
$this->fuckhtml
->getElementsByClassName(
"result-op",
[$datafield]
);
if(count($result) !== 0){
//
// Parse video carousel
//
if(
isset($datafield["attributes"]["tpl"]) &&
stripos($datafield["attributes"]["tpl"], "video") !== false
){
preg_match(
'/<!--s-data:([\S\s]*)-->/U',
$datafield["innerHTML"],
$matches
);
if(isset($matches[1])){
$json =
json_decode(
$matches[1],
true
);
if($json !== null){
// NOTE(review): "videoList" is read without an isset() check
foreach($json["videoList"] as $video){
$out["video"][] = [
"title" => $video["title"],
"description" =>
$this->titledots(
$video["desc"]
),
"date" =>
$this->parse_time(
$video["pubTime"]
),
"duration" =>
$this->hms2int(
$video["duration"]
),
"views" =>
$this->parse_viewcount(
$video["playCount"]
),
"thumb" => [
"ratio" => "16:9",
"url" => $video["poster"]
],
"url" => $video["bindProps"]["link"]
];
}
}
}
continue;
}
//
// Special result div (wiki entries, rich divs)
//
$title =
$this->fuckhtml
->getElementsByTagName(
"h3"
);
if(count($title) === 0){
// should have a title somewhere
continue;
}
// keep only the text after the first ">" when there is one
$title =
explode(
">",
$this->fuckhtml
->getTextContent(
$title[0]
),
2
);
if(count($title) === 2){
$title = $title[1];
}else{
$title = $title[0];
}
//
// Description: try, in order, p.sc-paragraph, p.c-color-gray2,
// div.c-color-text[aria-label], then span.c-font-normal
//
// probe for wiki-like entry
$description =
$this->fuckhtml
->getElementsByClassName(
"sc-paragraph",
"p"
);
if(count($description) === 0){
// try and get grey description
$description =
$this->fuckhtml
->getElementsByClassName(
"c-color-gray2",
"p"
);
if(count($description) === 0){
// probe for special social media description
$description =
$this->fuckhtml
->getElementsByClassName(
"c-color-text",
"div"
);
if(isset($description[0]["attributes"]["aria-label"])){
$description =
$this->fuckhtml
->getTextContent(
$description[0]
["attributes"]
["aria-label"]
);
}else{
// check for news tab description
$span =
$this->fuckhtml
->getElementsByClassName(
"c-font-normal",
"span"
);
$description = null;
foreach($span as $s){
if(isset($s["attributes"]["aria-label"])){
// NOTE(review): the text is taken from the LAST span, not
// from the span ($s) that has the aria-label. Confirm this
// is intended.
$description =
$this->titledots(
$this->fuckhtml
->getTextContent(
$span[count($span) - 1]
)
);
break;
}
}
}
}else{
$description =
$this->fuckhtml
->getTextContent(
$description[0]
);
}
}else{
// wiki-like entry: the text sits between <!--s-text--> markers
// of the last matching paragraph
preg_match(
'/<!--s-text-->([\S\s]*)<!--\/s-text-->/U',
$description[count($description) - 1]["innerHTML"],
$matches
);
if(isset($matches[1])){
$description =
$this->titledots(
$this->fuckhtml
->getTextContent(
$matches[1]
)
);
}else{
$description = null;
}
}
// get thumbnail
$thumb =
$this->fuckhtml
->getElementsByTagName(
"img"
);
if(count($thumb) !== 0){
$thumb = [
"ratio" => "1:1",
"url" =>
$this->unfuckthumb(
$this->fuckhtml
->getTextContent(
$thumb[0]["attributes"]["src"]
)
)
];
}else{
$thumb = [
"ratio" => null,
"url" => null
];
}
// get sublinks
// from the <!--s-data:{...}--> comment: either "buttons" or "mthreadList"
preg_match(
'/<!--s-data:([\S\s]*)-->/U',
$datafield["innerHTML"],
$matches
);
$sublinks = [];
if(isset($matches[1])){
$json =
json_decode(
$matches[1],
true
);
if($json !== null){
if(isset($json["buttons"])){
foreach($json["buttons"] as $button){
$sublinks[] = [
"title" => $button["text"],
"description" => null,
"date" => null,
"url" => $button["url"]
];
}
}elseif(isset($json["mthreadList"])){
foreach($json["mthreadList"] as $thread){
$sublinks[] = [
"title" =>
$this->fuckhtml
->getTextContent(
$thread["title"]
),
"description" => null,
"date" => null,
"url" => $thread["ttsInfo"]["titleUrl"]
];
}
}
}
}
// get URL
// handle placeholder hosts (fakeurl/nourl[.ubs].baidu.com)
$url =
$this->fuckhtml
->getTextContent(
$datafield["attributes"]["mu"]
);
if(
preg_match(
'/^https?:\/\/(?:fakeurl|nourl)(?:\.ubs)?\.baidu\.com/',
$url
)
){
// placeholder link: use the href of the first anchor instead
$as =
$this->fuckhtml
->getElementsByTagName(
"a"
);
if(count($as) !== 0){
$url =
$this->fuckhtml
->getTextContent(
$as[0]["attributes"]["href"]
);
}
}
// get xueshu sublinks
// get list
$xueshu_list =
$this->fuckhtml
->getElementsByClassName(
"op-xueshu-links-d20-list",
$div
);
if(count($xueshu_list) !== 0){
$this->fuckhtml->load($xueshu_list[0]);
$rows =
$this->fuckhtml
->getElementsByClassName(
"c-row",
"div"
);
// remove the "read more" row
foreach($rows as $row){
if(strpos($row["attributes"]["class"], "op-xueshu-links-more") !== false){
$xueshu_list[0]["innerHTML"] =
str_replace(
$row["outerHTML"],
"",
$xueshu_list[0]["innerHTML"]
);
}
}
$this->fuckhtml->load($xueshu_list[0]);
// every remaining row contributes all of its anchors as sublinks
foreach($rows as $row){
$this->fuckhtml->load($row);
if(strpos($row["attributes"]["class"], "op-xueshu-links-more") !== false){
continue;
}
$as =
$this->fuckhtml
->getElementsByTagName(
"a"
);
foreach($as as $a){
$sublinks[] = [
"title" =>
$this->titledots(
$this->fuckhtml
->getTextContent(
$a
)
),
"description" => null,
"date" => null,
"url" =>
$this->fuckhtml
->getTextContent(
$a["attributes"]["href"]
)
];
}
}
}
$out["web"][] = [
"title" => $title,
"description" => $description,
"url" => $url,
"date" => null,
"type" => "web",
"thumb" => $thumb,
"sublink" => $sublinks,
"table" => []
];
continue;
}
}
//
// Remove tracking URLs and fetch additional image resources
//
$this->resolve_urls($proxy, $out, ["web", "video"]);
$this->resolve_images($proxy, $out);
return $out;
}
//
// Image search, backed by Baidu's acjson endpoint.
// https://image.baidu.com/search/acjson?word=asmr&rn=60&pn=0&newReq=1
//
// $get: request parameters
//   "npt" => next page token, or falsy for the first page
//   "s"   => search terms
//   "sort", "size", "ratio", "format", "color", "type"
//         => filter values as listed by getfilters("images")
//
// Returns ["status" => "ok", "npt" => token|null, "image" => [...]].
// Throws Exception on transport failure, captcha, undecodable JSON,
// or an error reported by Baidu.
//
public function image($get){
	if($get["npt"]){
		[$params, $proxy] = $this->backend->get($get["npt"], "images");
		$params = json_decode($params, true);
		$params["pn"] = $params["pn"] + 60;
	}else{
		$proxy = $this->backend->get_ip();
		$params = [
			"word" => $get["s"],
			"rn" => 60, // results/page
			"pn" => 0, // item increment (0 * 60)
			"newReq" => 1 // otherwise the json comes back malformed
		];
		switch($get["sort"]){
			case "latest": $params["latest"] = 1; break;
			case "hot": $params["hot"] = 1; break;
		}
		if($get["size"] != "any"){
			$params["z"] = $get["size"];
		}
		if($get["ratio"] != "any"){
			$params["imgratio"] = $get["ratio"];
		}
		if($get["format"] != "any"){
			$params["imgformat"] = $get["format"];
		}
		if($get["color"] != "any"){
			$params["ic"] = $get["color"];
		}
		switch($get["type"]){
			case "hd": $params["hd"] = 1; break;
			case "isImgSet": $params["isImgSet"] = 1; break;
			case "copyright": $params["copyright"] = 1; break;
		}
	}
	try{
		$response =
			$this->get(
				$proxy,
				"https://image.baidu.com/search/acjson",
				$params,
				"https://image.baidu.com/search/index?tn=baiduimage&word=" . urlencode($get["s"])
			);
	}catch(Exception $error){
		throw new Exception("Failed to fetch JSON");
	}
	$json = json_decode($response, true);
	if($json === null){
		// detect captcha first, using the raw body: the decoded
		// value is null at this point and has nothing to inspect
		$this->fuckhtml->load($response);
		$this->detect_ass();
		// fallback to json decode error
		throw new Exception("Failed to decode JSON");
	}
	if(
		isset($json["message"]) &&
		$json["message"] != "success"
	){
		throw new Exception("Baidu returned an error: {$json["message"]}");
	}
	if(!isset($json["data"]["images"])){
		throw new Exception("Baidu did not return an image object");
	}
	$out = [
		"status" => "ok",
		"npt" => null,
		"image" => []
	];
	foreach($json["data"]["images"] as $image){
		if(!isset($image["objurl"], $image["thumburl"])){
			// entry without image links, nothing to show
			continue;
		}
		// thumbnail dimensions are encoded in its query string (&w=..&h=..)
		$thumb_size = [];
		$thumb_query = parse_url($image["thumburl"], PHP_URL_QUERY);
		if(is_string($thumb_query)){
			parse_str($thumb_query, $thumb_size);
		}
		$out["image"][] = [
			"title" =>
				$this->fuckhtml
				->getTextContent(
					$image["titleShow"] ?? ""
				),
			"source" => [
				[
					"url" => $image["objurl"],
					"width" => (int)($image["width"] ?? 0),
					"height" => (int)($image["height"] ?? 0)
				],
				[ // thumbnail
					"url" => $image["thumburl"],
					"width" => (int)($thumb_size["w"] ?? 0),
					"height" => (int)($thumb_size["h"] ?? 0)
				]
			],
			"url" => $image["fromUrl"] ?? null
		];
	}
	//
	// Detect if there's a next page
	//
	if(
		isset($json["data"]["totalNum"]) &&
		(int)$json["data"]["totalNum"] >= $params["pn"] + 60
	){
		$out["npt"] =
			$this->backend->store(
				json_encode($params),
				"images",
				$proxy
			);
	}
	return $out;
}
//
// Video search.
// https://www.baidu.com/sf/vsearch?pd=video&tn=vsearch&wd=jak%2Band%2Bdaxter&async=1&pn=0
// &pn is increased by 10 for every following page.
//
// $get: request parameters
//   "npt" => next page token, or falsy for the first page
//   "s"   => search terms (first page only)
//
// Returns ["status", "npt", "video", "author", "livestream",
// "playlist", "reel"]; only "video" and "npt" are ever filled.
// Throws Exception when the page cannot be fetched.
//
public function video($get){
	if($get["npt"]){
		[$params, $proxy] = $this->backend->get($get["npt"], "videos");
		$params = json_decode($params, true);
		$params["pn"] = $params["pn"] + 10;
	}else{
		$proxy = $this->backend->get_ip();
		$params = [
			"pd" => "video",
			"tn" => "vsearch",
			"wd" => $get["s"],
			"async" => 1,
			"pn" => 0
		];
	}
	try{
		$html =
			$this->get(
				$proxy,
				"https://www.baidu.com/sf/vsearch",
				$params
			);
	}catch(Exception $error){
		throw new Exception("Failed to get search page");
	}
	$html =
		str_replace(
			["\r", "\n"],
			"",
			$html
		);
	$out = [
		"status" => "ok",
		"npt" => null,
		"video" => [],
		"author" => [],
		"livestream" => [],
		"playlist" => [],
		"reel" => []
	];
	//
	// Split "label<colon>value" on the first colon.
	// Baidu uses the fullwidth colon (U+FF1A); it is written as an
	// escape because the literal character is easily lost or mistaken
	// for ":". The ASCII colon is accepted too.
	// Returns [label, value], or [text] when there is no colon.
	//
	$split_label = function($text){
		$parts = preg_split('/[:\x{FF1A}]/u', (string)$text, 2);
		if(!is_array($parts)){
			// invalid UTF-8
			return [(string)$text];
		}
		return $parts;
	};
	// results are delimited by <script> tags
	foreach(explode("<script>", $html) as $result){
		$result = trim($result);
		$this->fuckhtml->load($result);
		// get URL, it lives in an html comment
		preg_match(
			'/<!-- *([^ ]*) *-->/',
			$result,
			$matches
		);
		if(!isset($matches[1])){
			// no link, give up
			continue;
		}
		$link = $matches[1];
		// get title
		$title =
			$this->fuckhtml
			->getElementsByClassName(
				"video-title",
				"a"
			);
		if(count($title) === 0){
			// should not happen
			continue;
		}
		$title =
			$this->fuckhtml
			->getTextContent(
				$title[0]
			);
		// get thumbnail
		$img =
			$this->fuckhtml
			->getElementsByClassName(
				"border-radius",
				"img"
			);
		if(
			count($img) !== 0 &&
			isset($img[0]["attributes"]["src"])
		){
			$thumb = [
				"url" =>
					$this->unfuckthumb(
						$this->fuckhtml
						->getTextContent(
							$img[0]["attributes"]["src"]
						)
					),
				"ratio" => "16:9"
			];
		}else{
			$thumb = [
				"url" => null,
				"ratio" => null
			];
		}
		$span =
			$this->fuckhtml
			->getElementsByTagName(
				"span"
			);
		// get duration
		$duration =
			$this->fuckhtml
			->getElementsByClassName(
				"video_play_timer",
				$span
			);
		if(count($duration) !== 0){
			$duration =
				$this->hms2int(
					$this->fuckhtml
					->getTextContent(
						$duration[0]
					)
				);
		}else{
			$duration = null;
		}
		// get author
		// 来源:哔哩哔哩  ("source: <site name>")
		$author =
			$this->fuckhtml
			->getElementsByClassName(
				"wetSource",
				$span
			);
		if(count($author) !== 0){
			$author_parts =
				$split_label(
					$this->fuckhtml
					->getTextContent(
						$author[0]
					)
				);
			// use the part after the label, or the whole text
			// when there is no label at all
			$author = trim($author_parts[count($author_parts) - 1]);
			if($author === ""){
				$author = null;
			}
		}else{
			$author = null;
		}
		// get date posted
		// 发布时间:2024-05-06  ("date posted")
		// AND get description
		// 简介:Our first look  ("summary")
		$infospans =
			array_merge(
				$this->fuckhtml
				->getElementsByClassName(
					"c-font-normal",
					$span
				),
				$this->fuckhtml
				->getElementsByClassName(
					"c-font-normal",
					"div"
				)
			);
		$date = null;
		$description = null;
		foreach($infospans as $infospan){
			$infospan =
				$split_label(
					$this->fuckhtml
					->getTextContent(
						$infospan
					)
				);
			if(count($infospan) !== 2){
				// not a "label: value" field
				continue;
			}
			$infospan[1] =
				$this->fuckhtml
				->getTextContent(
					$infospan[1]
				);
			switch(trim($infospan[0])){
				case "发布时间": // date posted
					$date = $this->parse_time($infospan[1]);
					break;
				case "简介": // description
					$description = $infospan[1];
					break;
			}
		}
		$out["video"][] = [
			"title" => $this->titledots($title),
			// cast: the description stays "" when there is none,
			// without handing null to trim()
			"description" => $this->titledots((string)$description),
			"author" => [
				"name" => $author,
				"url" => null,
				"avatar" => null
			],
			"date" => $date,
			"duration" => $duration,
			"views" => null,
			"thumb" => $thumb,
			"url" => $link
		];
	}
	if(count($out["video"]) === 10){
		// a full page, assume there's another page after this
		$out["npt"] =
			$this->backend->store(
				json_encode($params),
				"videos",
				$proxy
			);
	}
	return $out;
}
//
// News search. Uses the regular result page with &tn=news and
// converts the "web" results of parse_search() into news articles.
//
// $get: request parameters
//   "npt"      => next page token, or falsy for the first page
//   "s"        => search terms (first page only)
//   "category" => optional, "any", "media" or "baijiahao"
//                 (see getfilters("news"))
//
// Returns ["status" => "ok", "npt" => token|null, "news" => [...]].
// Throws Exception when the page cannot be fetched, the next page
// token cannot be decoded, or Baidu answers with a captcha.
//
public function news($get){
	if($get["npt"]){
		[$json, $proxy] = $this->backend->get($get["npt"], "news");
		$json = json_decode($json, true);
		if(!isset($json["req"])){
			throw new Exception("Failed to decode next page token");
		}
		// resume the session that produced the previous page
		$this->cookie = isset($json["cookie"]) ? $json["cookie"] : null;
		$npt_data = $json["req"];
		$npt_data["pn"] = $npt_data["pn"] + 20;
	}else{
		$proxy = $this->backend->get_ip();
		$npt_data = [
			"wd" => $get["s"],
			"rn" => 20,
			"tn" => "news"
		];
		// category filter, as advertised by getfilters("news").
		// it ends up in the page token, so following pages keep it
		if(isset($get["category"])){
			if($get["category"] === "media"){
				$npt_data["medium"] = 1;
			}elseif($get["category"] === "baijiahao"){
				$npt_data["medium"] = 2;
			}
		}
	}
	try{
		$html = $this->get(
			$proxy,
			"https://www.baidu.com/s",
			$npt_data
		);
	}catch(Exception $error){
		throw new Exception("Failed to fetch search page");
	}
	if(!isset($npt_data["pn"])){
		// the first page is requested without &pn, but the stored
		// token needs an offset to increment from
		$npt_data["pn"] = 0;
	}
	$data = $this->parse_search($proxy, "news", $npt_data, $html);
	$out = [
		"status" => "ok",
		"npt" => $data["npt"],
		"news" => []
	];
	foreach($data["web"] as $article){
		$out["news"][] = [
			"title" => $article["title"],
			"author" => null,
			"description" => $article["description"],
			"date" => $article["date"],
			"thumb" => [
				"url" => $article["thumb"]["url"],
				"ratio" => $article["thumb"]["url"] !== null ? "16:9" : null,
			],
			"url" => $article["url"]
		];
	}
	return $out;
}
//
// Turn a Baidu thumbnail URL into something better to display.
//
// - gimg*.baidu.com proxy URLs: return the original image URL that
//   is carried, url-encoded, after "src=".
// - other URLs whose parameters follow the first "&": drop the
//   "size" parameter and raise an existing "q" (quality) to 90.
//
// Anything that doesn't match these shapes is returned unchanged.
//
private function unfuckthumb($url){
	if(!is_string($url)){
		// nothing to rewrite
		return $url;
	}
	// probe for proxy URL
	if(
		preg_match(
			'/^https?:\/\/gimg(?:[0-9]+)?\.baidu\.com/',
			$url
		)
	){
		$parts = explode("src=", $url);
		if(count($parts) !== 2){
			// unexpected format
			return $url;
		}
		return urldecode(explode("&", $parts[1])[0]);
	}
	// these URLs carry their parameters after the first "&"
	$q = explode("&", $url, 2);
	if(count($q) !== 2){
		// no parameters to clean up
		return $url;
	}
	// baidu doesn't follow spec, a "?" can show up mid-query:
	// &fmt=auto?s=BB32F3A050471AEC72886934030090C4&sec=1753203600&t=0fb2194775d3bd3d1bb114b818479e0a
	parse_str(str_replace("?", "&", $q[1]), $query);
	unset($query["size"]);
	if(isset($query["q"])){ $query["q"] = "90"; }
	$query = http_build_query($query);
	if($query === ""){
		// every parameter was stripped, don't leave a dangling "&"
		return $q[0];
	}
	// rebuild from the two halves instead of search-and-replace,
	// which could also hit an identical substring earlier in the URL
	return $q[0] . "&" . $query;
}
//
// Strip surrounding whitespace and dots (leftovers of "..." ellipses)
// from a title or description.
//
// null is treated as an empty string, callers do pass it when a
// field is missing. Always returns a string.
//
private function titledots($title){
	// cast first: handing null to trim() is deprecated since PHP 8.1
	return trim((string)$title, " .\t\n\r\0\x0B");
}
//
// Convert a "h:mm:ss", "m:ss" or "s" duration into seconds.
//
// At most three fields are read; each one is cast to int, so
// anything non-numeric counts as 0.
//
private function hms2int($time){
	$fields = explode(":", $time, 3);
	$seconds = 0;
	// every field to the left is worth 60 times the next one
	foreach($fields as $field){
		$seconds = ($seconds * 60) + (int)$field;
	}
	return $seconds;
}
//
// Parse a Chinese view counter such as "512次播放", "3万次播放" or
// "1.2万次播放" into an integer.
//
// 万 multiplies by 10,000 and 亿 by 100,000,000; 次 means "times".
// Decimals are supported: "1.2万次" is 12000. Matching only the digits
// next to the unit would read that as "2万次" and return 20000.
//
// Returns null when no counter is found.
//
private function parse_viewcount($views){
	$multipliers = [
		"亿" => 100000000,
		"万" => 10000
	];
	if(
		preg_match(
			'/([0-9]+(?:\.[0-9]+)?)(亿|万)?次/',
			(string)$views,
			$matches
		)
	){
		$multiplier = 1;
		if(
			isset($matches[2]) &&
			isset($multipliers[$matches[2]])
		){
			$multiplier = $multipliers[$matches[2]];
		}
		// round: float math would otherwise truncate 1.15 * 10000 to 11499
		return (int)round((float)$matches[1] * $multiplier);
	}
	return null;
}
//
// Convert a date as displayed by Baidu into a unix timestamp.
//
// Understood formats:
//   2023年8月7日     absolute date (year, month, day)
//   今天 / 今天11:45  today, optionally with a time
//   昨天 / 昨天11:45  yesterday, optionally with a time
//   5分钟前          5 minutes ago
//   2小时前          2 hours ago
//   3天前            3 days ago
//   1个月前          1 month ago
//   1年前            1 year ago
//   anything else is handed to strtotime() as-is
//
// Returns null, never false, when the date cannot be understood.
//
private function parse_time($time){
	$time = trim((string)$time);
	if($time === ""){
		return null;
	}
	$stamp = false;
	$parsed = false;
	// 2023年8月7日 => yyyy/m/d
	if(
		preg_match(
			'/([0-9]{4})年([0-9]{1,2})月([0-9]{1,2})日/',
			$time,
			$matches
		)
	){
		$stamp = strtotime("{$matches[1]}/{$matches[2]}/{$matches[3]}");
		$parsed = true;
	}
	// 昨天11:45 => yesterday at 11:45
	// 昨天 => yesterday
	// 今天 works the same way for today
	if($parsed === false){
		$days = [
			"昨天" => "Yesterday",
			"今天" => "Today"
		];
		foreach($days as $word => $english){
			if(
				preg_match(
					'/' . $word . '(.*)/',
					$time,
					$matches
				)
			){
				$stamp = strtotime("{$english} {$matches[1]}");
				$parsed = true;
				break;
			}
		}
	}
	// 3天前 => 3 days ago, and the other "<n> <unit> ago" forms
	if($parsed === false){
		$units = [
			"分钟" => "minutes",
			"小时" => "hours",
			"天" => "days",
			"个月" => "months",
			"年" => "years"
		];
		foreach($units as $word => $english){
			if(
				preg_match(
					'/([0-9]{1,4})' . $word . '前/',
					$time,
					$matches
				)
			){
				$stamp = strtotime("{$matches[1]} {$english} ago");
				$parsed = true;
				break;
			}
		}
	}
	// attempt to parse as-is
	if($parsed === false){
		$stamp = strtotime($time);
	}
	// strtotime() reports failure with false, callers expect null
	if($stamp === false){
		return null;
	}
	return $stamp;
}
//
// Captcha detection on the document currently loaded in fuckhtml.
//
// Throws Exception("Baidu returned a Captcha") when the page has no
// links at all, or when its first link points at the
// wappass.baidu.com captcha page. Returns nothing otherwise.
//
private function detect_ass(){
	$anchors =
		$this->fuckhtml
		->getElementsByTagName(
			"a"
		);
	// a page without a single link is treated as blocked
	$blocked = count($anchors) === 0;
	if($blocked === false){
		$first_href =
			$this->fuckhtml
			->getTextContent(
				$anchors[0]["attributes"]["href"]
			);
		if(
			preg_match(
				'/^https?:\/\/wappass\.baidu\.com\/static\/captcha/',
				$first_href
			)
		){
			$blocked = true;
		}
	}
	if($blocked){
		throw new Exception("Baidu returned a Captcha");
	}
}
}