forked from lolcat/4get
1
0
Fork 0
4get/scraper/brave.php

1840 lines
36 KiB
PHP
Raw Permalink Normal View History

2023-07-22 18:41:14 +00:00
<?php
class brave{
/**
 * Set up the two helpers every scraper method relies on: the HTML/JS
 * parser ($this->fuckhtml) and the backend ($this->backend), which this
 * class uses for proxy assignment and next-page token storage.
 */
public function __construct(){
	
	// include_once instead of include: both files declare classes, so
	// loading them a second time (second instance of this scraper, or
	// another scraper that already pulled them in) would be a fatal
	// "cannot redeclare class" error.
	include_once "lib/fuckhtml.php";
	$this->fuckhtml = new fuckhtml();
	
	include_once "lib/backend.php";
	$this->backend = new backend("brave");
}
/**
 * Describe the filters the frontend may offer for a given result page.
 *
 * @param string $page "web", "images", "videos" or "news".
 * @return array|null Filter definitions keyed by filter name; null for
 *                    any other page.
 */
public function getfilters($page){
	
	// region list shared by every page; the "all" entry is prepended
	// per page because its label differs in capitalisation
	$regions = [
		"ar" => "Argentina",
		"au" => "Australia",
		"at" => "Austria",
		"be" => "Belgium",
		"br" => "Brazil",
		"ca" => "Canada",
		"cl" => "Chile",
		"cn" => "China",
		"dk" => "Denmark",
		"fi" => "Finland",
		"fr" => "France",
		"de" => "Germany",
		"hk" => "Hong Kong",
		"in" => "India",
		"id" => "Indonesia",
		"it" => "Italy",
		"jp" => "Japan",
		"kr" => "Korea",
		"my" => "Malaysia",
		"mx" => "Mexico",
		"nl" => "Netherlands",
		"nz" => "New Zealand",
		"no" => "Norway",
		"pl" => "Poland",
		"pt" => "Portugal",
		"ph" => "Philippines",
		"ru" => "Russia",
		"sa" => "Saudi Arabia",
		"za" => "South Africa",
		"es" => "Spain",
		"se" => "Sweden",
		"ch" => "Switzerland",
		"tw" => "Taiwan",
		"tr" => "Turkey",
		"gb" => "United Kingdom",
		"us" => "United States"
	];
	
	$nsfw_filter = [
		"display" => "NSFW",
		"option" => [
			"yes" => "Yes",
			"maybe" => "Maybe",
			"no" => "No"
		]
	];
	
	$spellcheck_filter = [
		"display" => "Spellcheck",
		"option" => [
			"yes" => "Yes",
			"no" => "No"
		]
	];
	
	if($page == "web"){
		
		return [
			"country" => [
				"display" => "Country",
				"option" => ["all" => "All Regions"] + $regions
			],
			"nsfw" => $nsfw_filter,
			"newer" => [
				"display" => "Newer than",
				"option" => "_DATE"
			],
			"older" => [
				"display" => "Older than",
				"option" => "_DATE"
			],
			"spellcheck" => $spellcheck_filter
		];
	}
	
	if(in_array($page, ["images", "videos", "news"])){
		
		return [
			"country" => [
				"display" => "Country",
				"option" => ["all" => "All regions"] + $regions
			],
			"nsfw" => $nsfw_filter,
			"spellcheck" => $spellcheck_filter
		];
	}
	
	// unknown page: no filters
	return null;
}
2023-11-07 13:04:56 +00:00
/**
 * Perform a GET request against Brave and return the response body.
 *
 * @param mixed  $proxy   Proxy descriptor, handed unchanged to
 *                        $this->backend->assign_proxy().
 * @param string $url     Endpoint to request.
 * @param array  $get     Query parameters; appended to $url when non-empty.
 *                        (The former "= []" default was never usable because
 *                        required parameters follow it, and declaring it is
 *                        deprecated since PHP 8.0.)
 * @param string $nsfw    "yes", "maybe" or "no"; translated to the value of
 *                        the safesearch cookie. Other values are sent as-is.
 * @param string $country Value of the country cookie; "any" is sent as "all".
 * @return string|bool Whatever curl_exec() returned for the transfer.
 * @throws Exception Carrying curl's error message when the transfer fails.
 */
private function get($proxy, $url, $get, $nsfw, $country){
	
	switch($nsfw){
		
		case "yes": $nsfw = "off"; break;
		case "maybe": $nsfw = "moderate"; break;
		case "no": $nsfw = "strict"; break;
	}
	
	if($country == "any"){
		
		$country = "all";
	}
	
	$headers = [
		"User-Agent: " . config::USER_AGENT,
		"Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
		"Accept-Language: en-US,en;q=0.5",
		"Accept-Encoding: gzip",
		"Cookie: safesearch={$nsfw}; country={$country}; useLocation=0; summarizer=0",
		"DNT: 1",
		"Connection: keep-alive",
		"Upgrade-Insecure-Requests: 1",
		"Sec-Fetch-Dest: document",
		"Sec-Fetch-Mode: navigate",
		"Sec-Fetch-Site: none",
		"Sec-Fetch-User: ?1"
	];
	
	if($get !== []){
		
		$url .= "?" . http_build_query($get);
	}
	
	$curlproc = curl_init();
	
	curl_setopt($curlproc, CURLOPT_URL, $url);
	curl_setopt($curlproc, CURLOPT_ENCODING, ""); // default encoding
	curl_setopt($curlproc, CURLOPT_HTTPHEADER, $headers);
	curl_setopt($curlproc, CURLOPT_RETURNTRANSFER, true);
	curl_setopt($curlproc, CURLOPT_SSL_VERIFYHOST, 2);
	curl_setopt($curlproc, CURLOPT_SSL_VERIFYPEER, true);
	curl_setopt($curlproc, CURLOPT_CONNECTTIMEOUT, 30);
	curl_setopt($curlproc, CURLOPT_TIMEOUT, 30);
	
	$this->backend->assign_proxy($curlproc, $proxy);
	
	$data = curl_exec($curlproc);
	
	if(curl_errno($curlproc)){
		
		// read the message first, then release the handle so a failed
		// transfer does not leak it
		$error = curl_error($curlproc);
		curl_close($curlproc);
		
		throw new Exception($error);
	}
	
	curl_close($curlproc);
	return $data;
}
/**
 * Scrape one page of Brave web search results.
 *
 * @param array $get Request filters. Reads "npt" and, for a first page,
 *                   "s", "nsfw", "country", "older", "newer" and "spellcheck".
 * @return array Result set with the keys status, spelling, npt, answer,
 *               web, image, video, news and related.
 * @throws Exception When the search term is empty or too long, the page
 *                   cannot be fetched, or the embedded data object is missing.
 */
public function web($get){
	
	if($get["npt"]){
		
		// next page: the stored token carries the whole query
		[$q, $proxy] = $this->backend->get($get["npt"], "web");
		
		$q = json_decode($q, true);
		
		$search = $q["q"];
		$q["spellcheck"] = "0";
		
		// nsfw and country are passed to get() separately, not in the query string
		$nsfw = $q["nsfw"];
		unset($q["nsfw"]);
		
		$country = $q["country"];
		unset($q["country"]);
		
	}else{
		
		// first page: build the query from the request
		$search = $get["s"];
		
		if(strlen($search) === 0){
			
			throw new Exception("Search term is empty!");
		}
		
		if(strlen($search) > 2048){
			
			throw new Exception("Search term is too long!");
		}
		
		$proxy = $this->backend->get_ip();
		$nsfw = $get["nsfw"];
		$country = $get["country"];
		$older = $get["older"];
		$newer = $get["newer"];
		$spellcheck = $get["spellcheck"];
		
		$q = [
			"q" => $search
		];
		
		/*
			Pass older/newer filters to brave.
			false means "not set"; anything else is formatted with date().
			Brave wants both ends of the range, so a missing end is
			filled in with today / 1970-01-02.
		*/
		if($newer !== false){
			
			$newer = date("Y-m-d", $newer);
			
			if($older === false){
				
				$older = date("Y-m-d", time());
			}
		}
		
		if(
			is_string($older) === false &&
			$older !== false
		){
			
			$older = date("Y-m-d", $older);
			
			if($newer === false){
				
				$newer = "1970-01-02";
			}
		}
		
		if($older !== false){
			
			$q["tf"] = "{$newer}to{$older}";
		}
		
		// spellcheck
		if($spellcheck == "no"){
			
			$q["spellcheck"] = "0";
		}
	}
	
	try{
		$html =
			$this->get(
				$proxy,
				"https://search.brave.com/search",
				$q,
				$nsfw,
				$country
			);
		
	}catch(Exception $error){
		
		// keep the public message, but chain the cause for debugging
		throw new Exception("Could not fetch search page", 0, $error);
	}
	
	$out = [
		"status" => "ok",
		"spelling" => [
			"type" => "no_correction",
			"using" => null,
			"correction" => null
		],
		"npt" => null,
		"answer" => [],
		"web" => [],
		"image" => [],
		"video" => [],
		"news" => [],
		"related" => []
	];
	
	/*
		Get next page "token"
	*/
	$out["npt"] = $this->web_nextpage($html, $q, $nsfw, $country, $proxy);
	
	/*
		Locate the "const data = [...]" object embedded in a <script>
	*/
	$this->fuckhtml->load($html);
	
	$grep = [];
	
	foreach($this->fuckhtml->getElementsByTagName("script") as $script){
		
		preg_match(
			'/const data ?= ?(\[{.*}]);/',
			$script["innerHTML"],
			$grep
		);
		
		if(isset($grep[1])){
			
			break;
		}
	}
	
	if(!isset($grep[1])){
		
		throw new Exception("Could not get data JS");
	}
	
	$data =
		$this->fuckhtml
		->parseJsObject(
			$grep[1]
		);
	unset($grep);
	
	$data = $data[1]["data"]["body"]["response"];
	
	/*
		Get web results
	*/
	if(!isset($data["web"]["results"])){
		
		return $out;
	}
	
	foreach($data["web"]["results"] as $result){
		
		if(
			isset($result["thumbnail"]) &&
			is_array($result["thumbnail"])
		){
			
			$thumb = [
				"ratio" => $result["thumbnail"]["logo"] == "false" ? "16:9" : "1:1",
				"url" => $result["thumbnail"]["original"]
			];
		}else{
			
			$thumb = [
				"ratio" => null,
				"url" => null
			];
		}
		
		if(
			isset($result["age"]) &&
			$result["age"] != "void 0" &&
			$result["age"] != ""
		){
			
			$date = strtotime($result["age"]);
		}else{
			
			$date = null;
		}
		
		// a review snippet, when present, replaces the regular description
		if(isset($result["review"]["description"])){
			
			$description =
				$this->limitstrlen(
					strip_tags(
						$result["review"]["description"]
					)
				);
		}else{
			
			$description =
				$this->titledots(
					$this->fuckhtml
					->getTextContent(
						$result["description"]
					)
				);
		}
		
		$out["web"][] = [
			"title" =>
				$this->titledots(
					$result["title"]
				),
			"description" => $description,
			"url" => $result["url"],
			"date" => $date,
			"type" => "web",
			"thumb" => $thumb,
			"sublink" => $this->web_sublinks($result),
			"table" => $this->web_table($result)
		];
	}
	
	/*
		Get spelling autocorrect
	*/
	if(
		isset($data["query"]["bo_altered_diff"][0][0]) &&
		$data["query"]["bo_altered_diff"][0][0] == "true"
	){
		
		$using = [];
		
		foreach($data["query"]["bo_altered_diff"] as $diff){
			
			$using[] = $diff[1];
		}
		
		$out["spelling"] = [
			"type" => "including",
			"using" => implode(" ", $using),
			// $search is the query that was actually sent, on first
			// and on follow-up pages alike
			"correction" => $search
		];
	}
	
	/*
		Get wikipedia heads
		Only the first infobox is used, we get garbage most of the time
	*/
	if(isset($data["infobox"]["results"][0])){
		
		$out["answer"][] = $this->web_answer($data["infobox"]["results"][0]);
	}
	
	/*
		Get videos
	*/
	if(isset($data["videos"]["results"])){
		
		foreach($data["videos"]["results"] as $video){
			
			$out["video"][] = [
				"title" => $this->titledots($video["title"]),
				"description" => $this->titledots($video["description"]),
				"date" => isset($video["age"]) && $video["age"] != "void 0" ? strtotime($video["age"]) : null,
				"duration" => isset($video["video"]["duration"]) && $video["video"]["duration"] != "void 0" ? $this->hms2int($video["video"]["duration"]) : null,
				"views" => isset($video["video"]["views"]) && $video["video"]["views"] != "void 0" ? (int)$video["video"]["views"] : null,
				"thumb" =>
					isset($video["thumbnail"]["src"]) ?
					[
						"ratio" => "16:9",
						"url" => $this->unshiturl($video["thumbnail"]["src"])
					] :
					[
						"ratio" => null,
						"url" => null
					],
				"url" => $video["url"]
			];
		}
	}
	
	/*
		Get news
	*/
	if(isset($data["news"]["results"])){
		
		foreach($data["news"]["results"] as $news){
			
			$out["news"][] = [
				"title" => $this->titledots($news["title"]),
				"description" => $this->titledots($news["description"]),
				"date" => isset($news["age"]) ? strtotime($news["age"]) : null,
				// read the thumbnail of this news item, not of the
				// last video left over from the loop above
				"thumb" =>
					isset($news["thumbnail"]["src"]) ?
					[
						"ratio" => "16:9",
						"url" => $this->unshiturl($news["thumbnail"]["src"])
					] :
					[
						"ratio" => null,
						"url" => null
					],
				"url" => $news["url"]
			];
		}
	}
	
	/*
		Get discussions
	*/
	$disc_out = [];
	
	if(isset($data["discussions"]["results"])){
		
		foreach($data["discussions"]["results"] as $disc){
			
			$table = [];
			
			if(isset($disc["data"]["num_votes"])){
				
				$table["Votes"] = number_format($disc["data"]["num_votes"]);
			}
			
			if(isset($disc["data"]["num_answers"])){
				
				$table["Comments"] = number_format($disc["data"]["num_answers"]);
			}
			
			$disc_out[] = [
				"title" =>
					$this->titledots(
						$disc["title"]
					),
				"description" =>
					$this->limitstrlen(
						$this->titledots(
							$this->fuckhtml
							->getTextContent(
								$disc["description"]
							)
						)
					),
				"url" => $disc["url"],
				"date" => isset($disc["age"]) ? strtotime($disc["age"]) : null,
				"type" => "web",
				"thumb" => [
					"ratio" => null,
					"url" => null
				],
				"sublink" => [],
				"table" => $table
			];
		}
	}
	
	// append discussions at position 2
	array_splice($out["web"], 1, 0, $disc_out);
	
	return $out;
}

/**
 * Build the next-page token for a web search from the pagination links.
 *
 * @return mixed Whatever $this->backend->store() returns for the token, or
 *               null when the page has no usable "next" link.
 */
private function web_nextpage($html, $q, $nsfw, $country, $proxy){
	
	$this->fuckhtml->load($html);
	
	$pagination =
		$this->fuckhtml
		->getElementById(
			"pagination",
			"div"
		);
	
	if(!$pagination){
		
		return null;
	}
	
	$this->fuckhtml->load($pagination);
	
	$buttons =
		$this->fuckhtml
		->getElementsByClassName("btn", "a");
	
	if(count($buttons) === 0){
		
		return null;
	}
	
	// the "next" link, when there is one, is the last button
	$last = $buttons[count($buttons) - 1];
	
	if(
		strtolower(
			$this->fuckhtml
			->getTextContent(
				$last
			)
		) != "next" ||
		!isset($last["attributes"]["href"])
	){
		
		return null;
	}
	
	// without an offset in the link there is nothing to continue from
	if(
		preg_match(
			'/offset=([0-9]+)/',
			$this->fuckhtml->getTextContent($last["attributes"]["href"]),
			$offset
		) !== 1
	){
		
		return null;
	}
	
	$q["offset"] = (int)$offset[1];
	$q["nsfw"] = $nsfw;
	$q["country"] = $country;
	
	return
		$this->backend->store(
			json_encode($q),
			"web",
			$proxy
		);
}

/**
 * Collect the sublinks of one web result ("cluster" entries, then the
 * "deep_results" buttons).
 *
 * @return array List of ["title", "description", "url", "date"] entries.
 */
private function web_sublinks($result){
	
	$sublink = [];
	
	if(
		isset($result["cluster"]) &&
		is_array($result["cluster"])
	){
		
		foreach($result["cluster"] as $cluster){
			
			$sublink[] = [
				"title" => $this->titledots($cluster["title"]),
				"description" =>
					$this->titledots(
						$this->fuckhtml
						->getTextContent(
							$cluster["description"]
						)
					),
				"url" => $cluster["url"],
				"date" => null
			];
		}
	}
	
	// more sublinks
	if(
		isset($result["deep_results"]["buttons"]) &&
		is_array($result["deep_results"]["buttons"])
	){
		
		foreach($result["deep_results"]["buttons"] as $button){
			
			$sublink[] = [
				"title" => $this->titledots($button["title"]),
				"description" => null,
				"url" => $button["url"],
				"date" => null
			];
		}
	}
	
	return $sublink;
}

/**
 * Build the key/value table shown under one web result from its
 * structured sections: product or creative_work, review, software,
 * location, video and movie. Later sections overwrite equal keys.
 *
 * Absent sections may arrive as placeholder strings such as "void 0",
 * which is why sections are tested with is_array() rather than isset().
 *
 * @return array Map of label => value.
 */
private function web_table($result){
	
	$table = [];
	
	// product
	// creative_work
	$item = null;
	
	if(
		isset($result["product"]) &&
		is_array($result["product"])
	){
		
		$item = $result["product"];
		
	}elseif(
		isset($result["creative_work"]) &&
		is_array($result["creative_work"])
	){
		
		$item = $result["creative_work"];
	}
	
	if($item !== null){
		
		if(
			isset($item["offers"]) &&
			is_array($item["offers"])
		){
			
			foreach($item["offers"] as $offer){
				
				if(!isset($offer["price"])){
					
					// a currency without an amount is not a price
					continue;
				}
				
				if((float)$offer["price"] == 0){
					
					$price = "Free";
				}else{
					
					$price = $offer["price"];
					
					if(isset($offer["priceCurrency"])){
						
						$price .= " " . $offer["priceCurrency"];
					}
				}
				
				$table["Price"] = trim($price);
			}
		}
		
		if(isset($item["rating"])){
			
			$rating = null;
			
			if(isset($item["rating"]["ratingValue"])){
				
				$rating = $item["rating"]["ratingValue"];
				
				if(isset($item["rating"]["bestRating"])){
					
					$rating .= "/" . $item["rating"]["bestRating"];
				}
			}
			
			if(isset($item["rating"]["reviewCount"])){
				
				// wrap the count in parentheses only when a score precedes it
				$has_score = $rating !== null;
				
				if($has_score){
					
					$rating .= " (";
				}
				
				$rating .= number_format($item["rating"]["reviewCount"]) . " hits";
				
				if($has_score){
					
					$rating .= ")";
				}
			}
			
			if($rating !== null){
				
				$table["Rating"] = $rating;
			}
		}
	}
	
	// review
	if(
		isset($result["review"]) &&
		is_array($result["review"])
	){
		
		if(isset($result["review"]["rating"]["ratingValue"])){
			
			$table["Rating"] =
				$result["review"]["rating"]["ratingValue"] . "/" .
				$result["review"]["rating"]["bestRating"];
		}
	}
	
	// software
	if(
		isset($result["software"]) &&
		is_array($result["software"])
	){
		
		if(isset($result["software"]["author"])){
			
			$table["Author"] = $result["software"]["author"];
		}
		
		if(isset($result["software"]["stars"])){
			
			$table["Stars"] = number_format($result["software"]["stars"]);
		}
		
		if(isset($result["software"]["forks"])){
			
			$table["Forks"] = number_format($result["software"]["forks"]);
		}
		
		if(
			isset($result["software"]["programmingLanguage"]) &&
			$result["software"]["programmingLanguage"] != ""
		){
			
			$table["Programming languages"] = $result["software"]["programmingLanguage"];
		}
	}
	
	// location
	if(
		isset($result["location"]) &&
		is_array($result["location"])
	){
		
		if(isset($result["location"]["postal_address"]["displayAddress"])){
			
			$table["Address"] = $result["location"]["postal_address"]["displayAddress"];
		}
		
		if(
			isset($result["location"]["rating"]) &&
			is_array($result["location"]["rating"])
		){
			
			$table["Rating"] =
				$result["location"]["rating"]["ratingValue"] . "/" .
				$result["location"]["rating"]["bestRating"] . " (" .
				number_format($result["location"]["rating"]["reviewCount"]) . " votes)";
		}
		
		if(isset($result["location"]["contact"]["telephone"])){
			
			$table["Phone number"] =
				$result["location"]["contact"]["telephone"];
		}
		
		if(isset($result["location"]["price_range"])){
			
			$table["Price"] =
				$result["location"]["price_range"];
		}
	}
	
	// video
	if(
		isset($result["video"]) &&
		is_array($result["video"])
	){
		
		foreach($result["video"] as $key => $value){
			
			// only plain string fields become table rows
			if(is_string($value) === false){
				
				continue;
			}
			
			$table[ucfirst($key)] = $value;
		}
	}
	
	// movie
	if(
		isset($result["movie"]) &&
		is_array($result["movie"])
	){
		
		if(isset($result["movie"]["release"])){
			
			$table["Release date"] = $result["movie"]["release"];
		}
		
		if(isset($result["movie"]["directors"])){
			
			$directors = [];
			
			foreach($result["movie"]["directors"] as $director){
				
				$directors[] = $director["name"];
			}
			
			if(count($directors) !== 0){
				
				$table["Directors"] = implode(", ", $directors);
			}
		}
		
		if(isset($result["movie"]["actors"])){
			
			$actors = [];
			
			foreach($result["movie"]["actors"] as $actor){
				
				$actors[] = $actor["name"];
			}
			
			if(count($actors) !== 0){
				
				$table["Actors"] = implode(", ", $actors);
			}
		}
		
		if(
			isset($result["movie"]["rating"]) &&
			is_array($result["movie"]["rating"])
		){
			
			$table["Rating"] =
				$result["movie"]["rating"]["ratingValue"] . "/" .
				$result["movie"]["rating"]["bestRating"] . " (" .
				number_format($result["movie"]["rating"]["reviewCount"]) . " votes)";
		}
		
		if(isset($result["movie"]["duration"])){
			
			$table["Duration"] =
				$result["movie"]["duration"];
		}
		
		if(isset($result["movie"]["genre"])){
			
			$genres = [];
			
			foreach($result["movie"]["genre"] as $genre){
				
				$genres[] = $genre;
			}
			
			if(count($genres) !== 0){
				
				$table["Genre"] = implode(", ", $genres);
			}
		}
	}
	
	return $table;
}

/**
 * Convert one infobox entry into an "answer" element.
 *
 * @return array ["title", "description", "url", "thumb", "table", "sublink"].
 */
private function web_answer($info){
	
	if($info["subtype"] == "code"){
		
		// code answers go through the stackoverflow parser
		$description =
			$this->stackoverflow_parse($info["data"]["answer"]["text"]);
		
		if(isset($info["data"]["answer"]["author"])){
			
			$description[] = [
				"type" => "quote",
				"value" => "Answer from " . $info["data"]["answer"]["author"]
			];
		}
	}else{
		
		$description = [];
		
		if(
			isset($info["description"]) &&
			$info["description"] != ""
		){
			
			$description[] = [
				"type" => "quote",
				"value" => $info["description"]
			];
		}
		
		if(
			isset($info["long_desc"]) &&
			$info["long_desc"] != ""
		){
			
			$description[] = [
				"type" => "text",
				"value" => $this->titledots($info["long_desc"])
			];
		}
		
		// parse ratings
		if(
			isset($info["ratings"]) &&
			is_array($info["ratings"]) &&
			count($info["ratings"]) !== 0
		){
			
			$description[] = [
				"type" => "title",
				"value" => "Ratings"
			];
			
			foreach($info["ratings"] as $rating){
				
				$description[] = [
					"type" => "link",
					"url" => $rating["profile"]["url"],
					"value" => $rating["profile"]["name"]
				];
				
				$description[] = [
					"type" => "text",
					"value" => ": " . $rating["ratingValue"] . "/" . $rating["bestRating"] . "\n"
				];
			}
		}
	}
	
	$table = [];
	
	if(isset($info["attributes"])){
		
		foreach($info["attributes"] as $row){
			
			if($row[1] == "null"){
				
				// "null" rows before the first real row are skipped;
				// the first one after it ends the table
				if(count($table) !== 0){
					
					break;
				}
				
				continue;
			}
			
			$table[
				$this->fuckhtml->getTextContent($row[0])
			] =
				$this->fuckhtml->getTextContent($row[1]);
		}
	}
	
	$sublink = [];
	
	if(isset($info["profiles"])){
		
		foreach($info["profiles"] as $row){
			
			$name = $this->fuckhtml->getTextContent($row["name"]);
			
			if(strtolower($name) == "steampowered"){
				
				$name = "Steam";
			}
			
			$sublink[
				$this->fuckhtml->getTextContent($name)
			] =
				$this->fuckhtml->getTextContent($row["url"]);
		}
	}
	
	return [
		"title" => $this->fuckhtml->getTextContent($info["title"]),
		"description" => $description,
		"url" => $info["url"],
		"thumb" => isset($info["images"][0]["original"]) ? $info["images"][0]["original"] : null,
		"table" => $table,
		"sublink" => $sublink
	];
}
/**
 * Scrape one page of Brave news results.
 *
 * @param array $get Request filters. Reads "npt" and, for a first page,
 *                   "s", "nsfw", "country" and "spellcheck".
 * @return array Result set with the keys status, npt and news.
 * @throws Exception When the search term is empty or too long, the page
 *                   cannot be fetched, or the embedded data object cannot be
 *                   found or parsed.
 */
public function news($get){
	
	if($get["npt"]){
		
		// next page: everything comes from the stored token
		[$req, $proxy] = $this->backend->get($get["npt"], "news");
		
		$req = json_decode($req, true);
		
		$search = $req["q"];
		$country = $req["country"];
		$nsfw = $req["nsfw"];
		$spellcheck = $req["spellcheck"];
		
		$params = [
			"q" => $search,
			"offset" => $req["offset"],
			"spellcheck" => $spellcheck
		];
		
	}else{
		
		$search = $get["s"];
		
		if(strlen($search) === 0){
			
			throw new Exception("Search term is empty!");
		}
		
		if(strlen($search) > 2048){
			
			throw new Exception("Search term is too long!");
		}
		
		$proxy = $this->backend->get_ip();
		$nsfw = $get["nsfw"];
		$country = $get["country"];
		$spellcheck = $get["spellcheck"] == "yes" ? "1" : "0";
		
		$params = [
			"q" => $search,
			"spellcheck" => $spellcheck
		];
	}
	
	try{
		$html =
			$this->get(
				$proxy,
				"https://search.brave.com/news",
				$params,
				$nsfw,
				$country
			);
		
	}catch(Exception $error){
		
		// keep the public message, but chain the cause for debugging
		throw new Exception("Could not fetch search page", 0, $error);
	}
	
	$out = [
		"status" => "ok",
		"npt" => null,
		"news" => []
	];
	
	// load html
	$this->fuckhtml->load($html);
	
	// get npt
	$out["npt"] =
		$this->generatenextpagetoken(
			$search,
			$nsfw,
			$country,
			$spellcheck,
			"news",
			$proxy
		);
	
	preg_match(
		'/const data ?= ?(\[{.*}]);/',
		$html,
		$json
	);
	
	if(!isset($json[1])){
		
		throw new Exception("Failed to grep javascript object");
	}
	
	$json = $this->fuckhtml->parseJsObject($json[1], true);
	
	if($json === null){
		
		throw new Exception("Failed to parse javascript object");
	}
	
	// a response without a news list yields an empty result set
	// instead of warnings from iterating a missing key
	$results = $json[1]["data"]["body"]["response"]["news"]["results"] ?? [];
	
	if(!is_array($results)){
		
		$results = [];
	}
	
	foreach($results as $news){
		
		if(
			!isset($news["thumbnail"]["src"]) ||
			$news["thumbnail"]["src"] == "void 0"
		){
			
			$thumb = [
				"url" => null,
				"ratio" => null
			];
		}else{
			
			$thumb = [
				"url" => $this->unshiturl($news["thumbnail"]["src"]),
				"ratio" => "16:9"
			];
		}
		
		$out["news"][] = [
			"title" => $news["title"],
			"author" => null,
			"description" => $news["description"],
			"date" => !isset($news["age"]) || $news["age"] == "void 0" || $news["age"] == "null" ? null : strtotime($news["age"]),
			"thumb" => $thumb,
			"url" => $news["url"]
		];
	}
	
	return $out;
}
2023-08-08 07:09:47 +00:00
/**
 * Scrape Brave image results (single page, no next-page token).
 *
 * @param array $get Request filters. Reads "s", "country", "nsfw" and
 *                   "spellcheck".
 * @return array Result set with the keys status, npt (always null) and image.
 * @throws Exception When the search term is empty or too long, the page
 *                   cannot be fetched, or the embedded data object is missing.
 */
public function image($get){
	
	$search = $get["s"];
	
	if(strlen($search) === 0){
		
		throw new Exception("Search term is empty!");
	}
	
	if(strlen($search) > 2048){
		
		throw new Exception("Search term is too long!");
	}
	
	$country = $get["country"];
	$nsfw = $get["nsfw"];
	$spellcheck = $get["spellcheck"] == "yes" ? "1" : "0";
	
	$out = [
		"status" => "ok",
		"npt" => null,
		"image" => []
	];
	
	try{
		$html =
			$this->get(
				$this->backend->get_ip(), // no nextpage right now, pass proxy directly
				"https://search.brave.com/images",
				[
					"q" => $search,
					"spellcheck" => $spellcheck
				],
				$nsfw,
				$country
			);
		
	}catch(Exception $error){
		
		// keep the public message, but chain the cause for debugging
		throw new Exception("Could not fetch search page", 0, $error);
	}
	
	// same whitespace-tolerant pattern as the web and news scrapers, so
	// "const data=[...]" is found as well as "const data = [...]"
	preg_match(
		'/const data ?= ?(\[{.*}]);/',
		$html,
		$json
	);
	
	if(!isset($json[1])){
		
		throw new Exception("Failed to get data object");
	}
	
	$json =
		$this->fuckhtml
		->parseJsObject(
			$json[1]
		);
	
	// a response without a result list yields an empty result set
	$results = $json[1]["data"]["body"]["response"]["results"] ?? [];
	
	if(!is_array($results)){
		
		$results = [];
	}
	
	foreach($results as $result){
		
		$out["image"][] = [
			"title" => $result["title"],
			"source" => [
				[
					"url" => $result["properties"]["url"],
					"width" => null,
					"height" => null
				],
				[
					"url" => $result["thumbnail"]["src"],
					"width" => null,
					"height" => null
				]
			],
			"url" => $result["url"]
		];
	}
	
	return $out;
}
/**
 * Scrape one page of Brave video results.
 *
 * @param array $get Request filters. Reads "npt" and, for a first page,
 *                   "s", "country", "nsfw" and "spellcheck".
 * @return array Result set with the keys status, npt, video, author,
 *               livestream, playlist and reel (only video is filled here).
 * @throws Exception When the search term is empty or too long, the page
 *                   cannot be fetched, or the embedded data object is missing.
 */
public function video($get){
	
	if($get["npt"]){
		
		// next page: everything comes from the stored token
		[$npt, $proxy] = $this->backend->get($get["npt"], "videos");
		
		$npt = json_decode($npt, true);
		
		$search = $npt["q"];
		$spellcheck = $npt["spellcheck"];
		$country = $npt["country"];
		$nsfw = $npt["nsfw"];
		
		$params = [
			"q" => $search,
			"offset" => $npt["offset"],
			"spellcheck" => $spellcheck
		];
		
	}else{
		
		$search = $get["s"];
		
		if(strlen($search) === 0){
			
			throw new Exception("Search term is empty!");
		}
		
		if(strlen($search) > 2048){
			
			throw new Exception("Search term is too long!");
		}
		
		$country = $get["country"];
		$nsfw = $get["nsfw"];
		$spellcheck = $get["spellcheck"] == "yes" ? "1" : "0";
		
		$proxy = $this->backend->get_ip();
		
		$params = [
			"q" => $search,
			"spellcheck" => $spellcheck
		];
	}
	
	try{
		$html =
			$this->get(
				$proxy,
				"https://search.brave.com/videos",
				$params,
				$nsfw,
				$country
			);
		
	}catch(Exception $error){
		
		// keep the public message, but chain the cause for debugging
		throw new Exception("Could not fetch search page", 0, $error);
	}
	
	$this->fuckhtml->load($html);
	
	$out = [
		"status" => "ok",
		"npt" => null,
		"video" => [],
		"author" => [],
		"livestream" => [],
		"playlist" => [],
		"reel" => []
	];
	
	// get npt
	$out["npt"] =
		$this->generatenextpagetoken(
			$search,
			$nsfw,
			$country,
			$spellcheck,
			"videos",
			$proxy
		);
	
	// same whitespace-tolerant pattern as the web and news scrapers, so
	// "const data=[...]" is found as well as "const data = [...]"
	preg_match(
		'/const data ?= ?(\[{.*}]);/',
		$html,
		$json
	);
	
	if(!isset($json[1])){
		
		throw new Exception("Failed to get data object");
	}
	
	$json =
		$this->fuckhtml
		->parseJsObject(
			$json[1]
		);
	
	// absent fields show up as the placeholder strings "null" or "void 0";
	// map both (and a missing key) to a real null
	$clean = static function($value){
		
		if(
			$value === null ||
			$value == "null" ||
			$value == "void 0"
		){
			
			return null;
		}
		
		return $value;
	};
	
	// a response without a result list yields an empty result set
	$results = $json[1]["data"]["body"]["response"]["results"] ?? [];
	
	if(!is_array($results)){
		
		$results = [];
	}
	
	foreach($results as $result){
		
		$meta =
			isset($result["video"]) && is_array($result["video"]) ?
			$result["video"] : [];
		
		// only index into author when it really is an array; a placeholder
		// string must not be treated as one
		if(
			isset($meta["author"]) &&
			is_array($meta["author"])
		){
			
			$author = [
				"name" => $clean($meta["author"]["name"] ?? null),
				"url" => $clean($meta["author"]["url"] ?? null),
				"avatar" => $clean($meta["author"]["img"] ?? null)
			];
		}else{
			
			$author = [
				"name" => null,
				"url" => null,
				"avatar" => null
			];
		}
		
		if(
			isset($result["thumbnail"]) &&
			is_array($result["thumbnail"])
		){
			
			$thumb = [
				"url" => $result["thumbnail"]["original"],
				"ratio" => "16:9"
			];
		}else{
			
			$thumb = [
				"url" => null,
				"ratio" => null
			];
		}
		
		$description = $clean($result["description"] ?? null);
		$age = $clean($result["age"] ?? null);
		$duration = $clean($meta["duration"] ?? null);
		$views = $clean($meta["views"] ?? null);
		
		$out["video"][] = [
			"title" => $result["title"],
			"description" => $description === null ? null : $this->titledots($description),
			"author" => $author,
			"date" => $age === null ? null : strtotime($age),
			"duration" => $duration === null ? null : $this->hms2int($duration),
			"views" => $views === null ? null : (int)$views,
			"thumb" => $thumb,
			"url" => $result["url"]
		];
	}
	
	return $out;
}
2023-09-06 12:43:22 +00:00
/**
 * Flattens an answer's HTML into an ordered list of typed nodes.
 *
 * Handled top-level tags: <p> (split into text / inline_code / italic /
 * quote nodes), <img> (image node), <pre> (code node) and <ol> (numbered
 * lines appended as text). Other tags are ignored.
 *
 * @param string $html Answer markup handed to the fuckhtml parser.
 * @return array List of nodes shaped ["type" => ..., "value" => ...],
 *               or ["type" => "image", "url" => ...] for images.
 */
private function stackoverflow_parse($html){
	
	// $i mirrors the number of nodes pushed, so $answer[$i - 1] is the last one
	$i = 0;
	$answer = [];
	
	$this->fuckhtml->load($html);
	
	foreach(
		$this->fuckhtml->getElementsByTagName("*")
		as $snippet
	){
		
		switch($snippet["tagName"]){
			
			case "p":
				$this->fuckhtml->load($snippet["innerHTML"]);
				
				$codetags =
					$this->fuckhtml
					->getElementsByTagName("*");
				
				// remaining, not yet consumed part of the paragraph
				$tmphtml = $snippet["innerHTML"];
				
				foreach($codetags as $tag){
					
					if(!isset($tag["outerHTML"])){
						
						continue;
					}
					
					// split around the child tag: [text before, text after]
					$tmphtml =
						explode(
							$tag["outerHTML"],
							$tmphtml,
							2
						);
					
					$value = $this->fuckhtml->getTextContent($tmphtml[0], false, false);
					$this->appendtext($value, $answer, $i);
					
					switch($tag["tagName"]){
						
						case "code": $type = "inline_code"; break;
						case "em": $type = "italic"; break;
						case "blockquote": $type = "quote"; break;
						default: $type = "text";
					}
					
					$value = $this->fuckhtml->getTextContent($tag, false, true);
					
					if(trim($value) != ""){
						
						$answer[] = [
							"type" => $type,
							"value" => $value
						];
						$i++;
					}
					
					if(count($tmphtml) === 2){
						
						$tmphtml = $tmphtml[1];
					}else{
						
						// child tag was not found in the remaining markup
						break;
					}
				}
				
				if(is_array($tmphtml)){
					
					$tmphtml = $tmphtml[0];
				}
				
				if(strlen($tmphtml) !== 0){
					
					// trailing text after the last child tag
					$value = $this->fuckhtml->getTextContent($tmphtml, false, false);
					$this->appendtext($value, $answer, $i);
				}
				break;
			
			case "img":
				// read the attribute from the current element; $tag only
				// exists inside the <p> branch and would be stale here
				$answer[] = [
					"type" => "image",
					"url" =>
						$this->fuckhtml
						->getTextContent(
							$snippet["attributes"]["src"]
						)
				];
				$i++;
				break;
			
			case "pre":
				// trim the node preceding the code block, if there is one
				if($i !== 0){
					
					switch($answer[$i - 1]["type"]){
						
						case "text":
						case "italic":
							$answer[$i - 1]["value"] = rtrim($answer[$i - 1]["value"]);
							break;
					}
				}
				
				$answer[] =
					[
						"type" => "code",
						"value" =>
							rtrim(
								$this->fuckhtml
								->getTextContent(
									$snippet,
									true,
									false
								)
							)
					];
				$i++;
				break;
			
			case "ol":
				$o = 0;
				
				$this->fuckhtml->load($snippet);
				$li =
					$this->fuckhtml
					->getElementsByTagName("li");
				
				foreach($li as $elem){
					
					$o++;
					
					// render each item as "N. text"
					$this->appendtext(
						$o . ". " .
						$this->fuckhtml
						->getTextContent(
							$elem
						),
						$answer,
						$i
					);
				}
				break;
		}
	}
	
	// drop trailing whitespace from a final text node
	if(
		$i !== 0 &&
		$answer[$i - 1]["type"] == "text"
	){
		
		$answer[$i - 1]["value"] = rtrim($answer[$i - 1]["value"]);
	}
	
	return $answer;
}
2023-08-08 07:09:47 +00:00
/**
 * Converts a "H:MM:SS", "M:SS" or "S" duration string to seconds.
 *
 * @param string $time Colon-separated duration.
 * @return int Total number of seconds.
 */
private function hms2int($time){
	
	$seconds = 0;
	
	// at most 3 fields; each field to the left is worth 60x the next one
	foreach(explode(":", $time, 3) as $part){
		
		$seconds = ($seconds * 60) + (int)$part;
	}
	
	return $seconds;
}
2023-07-22 18:41:14 +00:00
/**
 * Adds plain text to a node list, merging it into the previous node when
 * that node is already of type "text".
 *
 * @param string $payload Text to add; skipped when blank after trim().
 * @param array  $text    Node list, modified in place.
 * @param int    $index   Node counter ($index - 1 is treated as the last
 *                        node); incremented when a new node is pushed.
 */
private function appendtext($payload, &$text, &$index){
	
	if(trim($payload) == ""){
		
		// nothing printable to add
		return;
	}
	
	$chunk = preg_replace('/ $/', " ", $payload);
	
	$previous_is_text =
		$index !== 0 &&
		$text[$index - 1]["type"] == "text";
	
	if(!$previous_is_text){
		
		// start a fresh text node
		$text[] = [
			"type" => "text",
			"value" => $chunk
		];
		$index++;
		return;
	}
	
	// consecutive text runs are joined with a blank line
	$text[$index - 1]["value"] .= "\n\n" . $chunk;
}
/**
 * Copies "label: value" pairs found in a list of parsed elements into
 * $data["table"].
 *
 * @param array $html_collection Parsed elements, each with an "innerHTML" key.
 * @param array $data            Result array, modified in place.
 */
private function tablesublink($html_collection, &$data){
	
	foreach($html_collection as $element){
		
		// inline stylesheets must not end up in the extracted text
		$element["innerHTML"] = preg_replace(
			'/<style>[\S\s]*<\/style>/i',
			"",
			$element["innerHTML"]
		);
		
		$pair =
			explode(
				":",
				$this->fuckhtml->getTextContent($element),
				2
			);
		
		if(isset($pair[1])){
			
			$label = $pair[0];
			$content = $pair[1];
		}else{
			
			// no colon: the whole text is filed under "Rating"
			$label = "Rating";
			$content = $pair[0];
		}
		
		$data["table"][trim($label)] = trim($content);
	}
}
2023-11-07 13:04:56 +00:00
/*
2023-07-22 18:41:14 +00:00
private function getimagelinkfromstyle($thumb){
$thumb =
$this->fuckhtml
->getElementsByClassName(
$thumb,
"div"
);
if(count($thumb) === 0){
return [
"url" => null,
"ratio" => null
];
}
$thumb = $thumb[0]["attributes"]["style"];
preg_match(
'/background-image: ?url\((\'[^\']+\'|"[^"]+"|[^\)]+)\)/',
$thumb,
$thumb
);
$url = $this->fuckhtml->getTextContent($this->unshiturl(trim($thumb[1], '"\' ')));
if(parse_url($url, PHP_URL_HOST) == "cdn.search.brave.com"){
return [
"url" => null,
"ratio" => null
];
}
return [
"url" => $url,
"ratio" => "16:9"
];
2023-11-07 13:04:56 +00:00
}*/
2023-07-22 18:41:14 +00:00
/**
 * Returns the first line of $text after word-wrapping it at 300 characters.
 *
 * @param string $text Text to shorten.
 * @return string Everything before the first line break of the wrapped text.
 */
private function limitstrlen($text){
	
	$wrapped = wordwrap($text, 300, "\n");
	$newline = strpos($wrapped, "\n");
	
	if($newline === false){
		
		// no line break at all: nothing to cut
		return $wrapped;
	}
	
	return substr($wrapped, 0, $newline);
}
2023-11-07 13:04:56 +00:00
/*
2023-07-22 18:41:14 +00:00
private function limitwhitespace($text){
return
preg_replace(
'/[\s]+/',
" ",
$text
);
2023-11-07 13:04:56 +00:00
}*/
2023-07-22 18:41:14 +00:00
/**
 * Trims a string and removes a trailing truncation marker.
 *
 * Both "..." and the single-character ellipsis U+2026 are stripped; the
 * latter is 3 bytes long in UTF-8, so the same 3-byte suffix check covers
 * both.
 *
 * @param string $title Text that may end with an ellipsis.
 * @return string Trimmed text without the trailing ellipsis.
 */
private function titledots($title){
	
	$substr = substr($title, -3);
	
	if(
		$substr === "..." ||
		$substr === "\xE2\x80\xA6" // UTF-8 bytes of U+2026
	){
		
		return trim(substr($title, 0, -3));
	}
	
	return trim($title);
}
2023-11-07 13:04:56 +00:00
/**
 * Builds and stores a next-page token from the currently loaded document.
 *
 * Looks at the last <a class="btn"> element; when its text is "next" and
 * its href carries an "offset=N" parameter, the search parameters are
 * JSON-encoded and handed to the backend store.
 *
 * @param string $q          Search query.
 * @param string $nsfw       NSFW filter value.
 * @param string $country    Country filter value.
 * @param string $spellcheck Spellcheck filter value.
 * @param string $page       Page type passed through to the store.
 * @param mixed  $proxy      Proxy passed through to the store.
 * @return mixed Result of backend->store(), or null when no usable
 *               next-page link exists.
 */
private function generatenextpagetoken($q, $nsfw, $country, $spellcheck, $page, $proxy){
	
	$buttons =
		$this->fuckhtml
		->getElementsByClassName("btn", "a");
	
	if(count($buttons) === 0){
		
		return null;
	}
	
	// only the last matching anchor is inspected
	$button = $buttons[count($buttons) - 1];
	
	if(
		strtolower(
			$this->fuckhtml
			->getTextContent(
				$button
			)
		) != "next"
	){
		
		return null;
	}
	
	if(!isset($button["attributes"]["href"])){
		
		return null;
	}
	
	// without an offset we would silently store a token for offset 0
	if(
		preg_match(
			'/offset=([0-9]+)/',
			$this->fuckhtml->getTextContent($button["attributes"]["href"]),
			$offset
		) !== 1
	){
		
		return null;
	}
	
	return
		$this->backend->store(
			json_encode(
				[
					"q" => $q,
					"offset" => (int)$offset[1],
					"nsfw" => $nsfw,
					"country" => $country,
					"spellcheck" => $spellcheck
				]
			),
			$page,
			$proxy
		);
}
2023-07-22 18:41:14 +00:00
/**
 * Recovers the original URL embedded in a proxied image URL.
 *
 * The original URL is stored base64-encoded (URL-safe alphabet, split
 * into path segments by "/") and always begins with "aHR0", the encoding
 * of "htt".
 *
 * @param string $url Proxied URL.
 * @return string Decoded original URL, or $url unchanged when it does not
 *                contain exactly one "aHR0" marker.
 */
private function unshiturl($url){
	
	// https://imgs.search.brave.com/XFnbR8Sl7ge82MBDEH7ju0UHImRovMVmQ2qnDvgNTuA/rs:fit:844:225:1/g:ce/aHR0cHM6Ly90c2U0/Lm1tLmJpbmcubmV0/L3RoP2lkPU9JUC54/UWotQXU5N2ozVndT/RDJnNG9BNVhnSGFF/SyZwaWQ9QXBp.jpeg
	
	$tmp = explode("aHR0", $url);
	
	if(count($tmp) !== 2){
		
		// nothing to do
		return $url;
	}
	
	// keep everything up to the file extension
	$payload = explode(".", $tmp[1])[0];
	
	// the "/" characters are only segment separators, drop them first
	$payload = str_replace("/", "", $payload);
	
	// map the URL-safe alphabet back: "-" => "+", "_" => "/"
	// (base64_decode() would otherwise silently skip "-")
	$payload = strtr($payload, "-_", "+/");
	
	return base64_decode("aHR0" . $payload);
}
}