fuckhtml = new fuckhtml(); include "lib/nextpage.php"; $this->nextpage = new nextpage("brave"); } public function getfilters($page){ switch($page){ case "web": return [ "country" => [ "display" => "Country", "option" => [ "all" => "All Regions", "ar" => "Argentina", "au" => "Australia", "at" => "Austria", "be" => "Belgium", "br" => "Brazil", "ca" => "Canada", "cl" => "Chile", "cn" => "China", "dk" => "Denmark", "fi" => "Finland", "fr" => "France", "de" => "Germany", "hk" => "Hong Kong", "in" => "India", "id" => "Indonesia", "it" => "Italy", "jp" => "Japan", "kr" => "Korea", "my" => "Malaysia", "mx" => "Mexico", "nl" => "Netherlands", "nz" => "New Zealand", "no" => "Norway", "pl" => "Poland", "pt" => "Portugal", "ph" => "Philippines", "ru" => "Russia", "sa" => "Saudi Arabia", "za" => "South Africa", "es" => "Spain", "se" => "Sweden", "ch" => "Switzerland", "tw" => "Taiwan", "tr" => "Turkey", "gb" => "United Kingdom", "us" => "United States" ] ], "nsfw" => [ "display" => "NSFW", "option" => [ "yes" => "Yes", "maybe" => "Maybe", "no" => "No" ] ], "newer" => [ "display" => "Newer than", "option" => "_DATE" ], "older" => [ "display" => "Older than", "option" => "_DATE" ], "spellcheck" => [ "display" => "Spellcheck", "option" => [ "no" => "No", "yes" => "Yes" ] ] ]; break; case "images": case "videos": case "news": return [ "country" => [ "display" => "Country", "option" => [ "all" => "All regions", "ar" => "Argentina", "au" => "Australia", "at" => "Austria", "be" => "Belgium", "br" => "Brazil", "ca" => "Canada", "cl" => "Chile", "cn" => "China", "dk" => "Denmark", "fi" => "Finland", "fr" => "France", "de" => "Germany", "hk" => "Hong Kong", "in" => "India", "id" => "Indonesia", "it" => "Italy", "jp" => "Japan", "kr" => "Korea", "my" => "Malaysia", "mx" => "Mexico", "nl" => "Netherlands", "nz" => "New Zealand", "no" => "Norway", "pl" => "Poland", "pt" => "Portugal", "ph" => "Philippines", "ru" => "Russia", "sa" => "Saudi Arabia", "za" => "South Africa", "es" => "Spain", "se" => "Sweden", "ch" => "Switzerland", "tw" => "Taiwan", "tr" => "Turkey", "gb" => "United Kingdom", "us" => "United States" ] ], "nsfw" => [ "display" => "NSFW", "option" => [ "yes" => "Yes", "maybe" => "Maybe", "no" => "No" ] ] ]; break; } } private function get($url, $get = [], $nsfw, $country){ switch($nsfw){ case "yes": $nsfw = "off"; break; case "maybe": $nsfw = "moderate"; break; case "no": $nsfw = "strict"; break; } if($country == "any"){ $country = "all"; } $headers = [ "User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:107.0) Gecko/20100101 Firefox/110.0", "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8", "Accept-Language: en-US,en;q=0.5", "Accept-Encoding: gzip", "Cookie: safesearch={$nsfw}; country={$country}; useLocation=0; summarizer=0", "DNT: 1", "Connection: keep-alive", "Upgrade-Insecure-Requests: 1", "Sec-Fetch-Dest: document", "Sec-Fetch-Mode: navigate", "Sec-Fetch-Site: none", "Sec-Fetch-User: ?1" ]; $curlproc = curl_init(); if($get !== []){ $get = http_build_query($get); $url .= "?" . $get; } curl_setopt($curlproc, CURLOPT_URL, $url); curl_setopt($curlproc, CURLOPT_ENCODING, ""); // default encoding curl_setopt($curlproc, CURLOPT_HTTPHEADER, $headers); curl_setopt($curlproc, CURLOPT_RETURNTRANSFER, true); curl_setopt($curlproc, CURLOPT_SSL_VERIFYHOST, 2); curl_setopt($curlproc, CURLOPT_SSL_VERIFYPEER, true); curl_setopt($curlproc, CURLOPT_CONNECTTIMEOUT, 30); curl_setopt($curlproc, CURLOPT_TIMEOUT, 30); $data = curl_exec($curlproc); if(curl_errno($curlproc)){ throw new Exception(curl_error($curlproc)); } curl_close($curlproc); return $data; } public function web($get){ if($get["npt"]){ // get next page data $q = json_decode($this->nextpage->get($get["npt"], "web"), true); $search = $q["q"]; $q["spellcheck"] = "0"; $nsfw = $q["nsfw"]; unset($q["nsfw"]); $country = $q["country"]; unset($q["country"]); }else{ // get _GET data instead $search = $get["s"]; if(strlen($search) === 0){ throw new Exception("Search term is empty!"); } if(strlen($search) > 2048){ throw new Exception("Search query is too long!"); } $nsfw = $get["nsfw"]; $country = $get["country"]; $older = $get["older"]; $newer = $get["newer"]; $spellcheck = $get["spellcheck"]; $q = [ "q" => $search ]; /* Pass older/newer filters to brave */ if($newer !== false){ $newer = date("Y-m-d", $newer); if($older === false){ $older = date("Y-m-d", time()); } } if( is_string($older) === false && $older !== false ){ $older = date("Y-m-d", $older); if($newer === false){ $newer = "1970-01-02"; } } if($older !== false){ $q["tf"] = "{$newer}to{$older}"; } // spellcheck if($spellcheck == "no"){ $q["spellcheck"] = "0"; } } $handle = fopen("scraper/brave.html", "r"); $html = fread($handle, filesize("scraper/brave.html")); fclose($handle); /* try{ $html = $this->get( "https://search.brave.com/search", $q, $nsfw, $country ); }catch(Exception $error){ throw new Exception("Could not fetch search page"); } */ $out = [ "status" => "ok", "spelling" => [ "type" => "no_correction", "using" => null, "correction" => null ], "npt" => null, "answer" => [], "web" => [], "image" => [], "video" => [], "news" => [], "related" => [] ]; // load html $this->fuckhtml->load($html); /* Get next page "token" */ $nextpage = $this->fuckhtml ->getElementById( "pagination", "div" ); if($nextpage){ $this->fuckhtml->load($nextpage); $nextpage = $this->fuckhtml ->getElementsByClassName("btn", "a"); if(count($nextpage) !== 0){ $nextpage = $nextpage[count($nextpage) - 1]; if( strtolower( $this->fuckhtml ->getTextContent( $nextpage ) ) == "next" ){ preg_match( '/offset=([0-9]+)/', $this->fuckhtml->getTextContent($nextpage["attributes"]["href"]), $nextpage ); $q["offset"] = (int)$nextpage[1]; $q["nsfw"] = $nsfw; $q["country"] = $country; $out["npt"] = $this->nextpage->store( json_encode($q), "web" ); } } } $this->fuckhtml->load($html); $script_disc = $this->fuckhtml ->getElementsByTagName( "script" ); $grep = []; foreach($script_disc as $discs){ preg_match( '/const data ?= ?(\[{.*}]);/', $discs["innerHTML"], $grep ); if(isset($grep[1])){ break; } } if(!isset($grep[1])){ throw new Exception("Could not get data JS"); } $data = $this->fuckhtml ->parseJsObject( $grep[1] ); unset($grep); $data = $data[1]["data"]["body"]["response"]; /* Get web results */ if(!isset($data["web"]["results"])){ return $out; } //$has_so_answer = false; foreach($data["web"]["results"] as $result){ if(isset($result["thumbnail"])){ $thumb = [ "ratio" => $result["thumbnail"]["logo"] == "false" ? "16:9" : "1:1", "url" => $result["thumbnail"]["original"] ]; }else{ $thumb = [ "ratio" => null, "url" => null ]; } $sublink = []; if(isset($result["cluster"])){ foreach($result["cluster"] as $cluster){ $sublink[] = [ "title" => $this->titledots($cluster["title"]), "description" => $this->titledots( $this->fuckhtml ->getTextContent( $cluster["description"] ) ), "url" => $cluster["url"], "date" => null ]; } } // parse table elements $table = []; // product $ref = null; if(isset($result["product"])){ $ref = &$result["product"]; }elseif(isset($result["creative_work"])){ $ref = &$result["creative_work"]; } if($ref !== null){ if(isset($ref["offers"])){ foreach($ref["offers"] as $offer){ $price = null; if(isset($offer["price"])){ if((float)$offer["price"] == 0){ $price = "Free"; }else{ $price = $offer["price"]; } } if($price !== "Free"){ if(isset($offer["priceCurrency"])){ $price .= " " . $offer["priceCurrency"]; } } if($price !== null){ $table["Price"] = trim($price); } } } if(isset($ref["rating"])){ $rating = null; if(isset($ref["rating"]["ratingValue"])){ $rating = $ref["rating"]["ratingValue"]; if(isset($ref["rating"]["bestRating"])){ $rating .= "/" . $ref["rating"]["bestRating"]; } } if(isset($ref["rating"]["reviewCount"])){ $isnull = $rating === null ? false : true; if($isnull){ $rating .= " ("; } $rating .= number_format($ref["rating"]["reviewCount"]) . " hits"; if($isnull){ $rating .= ")"; } } if($rating !== null){ $table["Rating"] = $rating; } } } // video if(isset($result["video"])){ foreach($result["video"] as $key => $value){ if(is_string($result["video"][$key]) === false){ continue; } $table[ucfirst($key)] = $value; } } /* Get StackOverflow answers */ // commented out since it also returns alot of garbage /* if( $has_so_answer === false && isset($result["qa"]) ){ $has_so_answer = true; $answer = $this->stackoverflow_parse($result["qa"]["answer"]["text"]); if(isset($result["qa"]["answer"]["author"])){ $answer[] = [ "type" => "quote", "value" => "Answer from " . $result["qa"]["answer"]["author"] ]; } $out["answer"][] = [ "title" => $this->fuckhtml ->getTextContent( $result["qa"]["question"] ), "description" => $answer, "url" => $result["url"], "thumb" => null, "table" => [], "sublink" => [] ]; }*/ $out["web"][] = [ "title" => $this->titledots( $result["title"] ), "description" => $this->titledots( $this->fuckhtml ->getTextContent( $result["description"] ) ), "url" => $result["url"], "date" => isset($result["age"]) ? strtotime($result["age"]) : null, "type" => "web", "thumb" => $thumb, "sublink" => $sublink, "table" => $table ]; } /* Get spelling autocorrect */ if( isset($data["query"]["bo_altered_diff"][0][0]) && $data["query"]["bo_altered_diff"][0][0] == "true" ){ $out["spelling"] = [ "type" => "including", "using" => $data["query"]["bo_altered_diff"][0][1], "correction" => $get["s"] ]; } /* Get wikipedia heads */ if(isset($data["infobox"]["results"][0])){ foreach($data["infobox"]["results"] as $info){ if($info["subtype"] == "code"){ $description = $this->stackoverflow_parse($info["data"]["answer"]["text"]); if(isset($info["data"]["answer"]["author"])){ $description[] = [ "type" => "quote", "value" => "Answer from " . $info["data"]["answer"]["author"] ]; } }else{ $description = []; if( isset($info["description"]) && $info["description"] != "" ){ $description[] = [ "type" => "quote", "value" => $info["description"] ]; } if( isset($info["long_desc"]) && $info["long_desc"] != "" ){ $description[] = [ "type" => "text", "value" => $this->titledots($info["long_desc"]) ]; } } $table = []; if(isset($info["attributes"])){ foreach($info["attributes"] as $row){ if( $row[1] == "null" && count($table) !== 0 ){ break; } if($row[1] == "null"){ continue; } $table[ $this->fuckhtml->getTextContent($row[0]) ] = $this->fuckhtml->getTextContent($row[1]); } } $sublink = []; if(isset($info["profiles"])){ foreach($info["profiles"] as $row){ $name = $this->fuckhtml->getTextContent($row["name"]); if(strtolower($name) == "steampowered"){ $name = "Steam"; } $sublink[ $this->fuckhtml->getTextContent($name) ] = $this->fuckhtml->getTextContent($row["url"]); } } $out["answer"][] = [ "title" => $this->fuckhtml->getTextContent($info["title"]), "description" => $description, "url" => $info["url"], "thumb" => isset($info["images"][0]["original"]) ? $info["images"][0]["original"] : null, "table" => $table, "sublink" => $sublink ]; break; // only iterate once, we get garbage most of the time } } /* Get videos */ if(isset($data["videos"]["results"])){ foreach($data["videos"]["results"] as $video){ $out["video"][] = [ "title" => $this->titledots($video["title"]), "description" => $this->titledots($video["description"]), "date" => isset($video["age"]) ? strtotime($video["age"]) : null, "duration" => isset($video["video"]["duration"]) ? $this->hms2int($video["video"]["duration"]) : null, "views" => null, "thumb" => isset($video["thumbnail"]["src"]) ? [ "ratio" => "16:9", "url" => $this->unshiturl($video["thumbnail"]["src"]) ] : [ "ratio" => null, "url" => null ], "url" => $video["url"] ]; } } /* Get news */ if(isset($data["news"]["results"])){ foreach($data["news"]["results"] as $news){ $out["news"][] = [ "title" => $this->titledots($news["title"]), "description" => $this->titledots($news["description"]), "date" => isset($news["age"]) ? strtotime($news["age"]) : null, "thumb" => isset($video["thumbnail"]["src"]) ? [ "ratio" => "16:9", "url" => $this->unshiturl($video["thumbnail"]["src"]) ] : [ "ratio" => null, "url" => null ], "url" => $news["url"] ]; } } /* Get discussions */ $disc_out = []; if(isset($data["discussions"]["results"])){ foreach($data["discussions"]["results"] as $disc){ $table = []; if(isset($disc["data"]["num_votes"])){ $table["Votes"] = (int)$disc["data"]["num_votes"]; } if(isset($disc["data"]["num_answers"])){ $table["Comments"] = (int)$disc["data"]["num_answers"]; } $disc_out[] = [ "title" => $this->titledots( $disc["title"] ), "description" => $this->limitstrlen( $this->titledots( $this->fuckhtml ->getTextContent( $disc["description"] ) ) ), "url" => $disc["url"], "date" => isset($disc["age"]) ? strtotime($disc["age"]) : null, "type" => "web", "thumb" => [ "ratio" => null, "url" => null ], "sublink" => [], "table" => $table ]; } } // append discussions at position 2 array_splice($out["web"], 1, 0, $disc_out); return $out; } public function news($get){ $search = $get["s"]; if(strlen($search) === 0){ throw new Exception("Search term is empty!"); } $nsfw = $get["nsfw"]; $country = $get["country"]; if(strlen($search) > 2048){ throw new Exception("Search query is too long!"); } /* $handle = fopen("scraper/brave-news.html", "r"); $html = fread($handle, filesize("scraper/brave-news.html")); fclose($handle);*/ try{ $html = $this->get( "https://search.brave.com/news", [ "q" => $search ], $nsfw, $country ); }catch(Exception $error){ throw new Exception("Could not fetch search page"); } $out = [ "status" => "ok", "npt" => null, "news" => [] ]; // load html $this->fuckhtml->load($html); $news = $this->fuckhtml ->getElementsByClassName( "snippet inline gap-standard", "div" ); foreach($news as $article){ $data = [ "title" => null, "author" => null, "description" => null, "date" => null, "thumb" => [ "url" => null, "ratio" => null ], "url" => null ]; $this->fuckhtml->load($article); $elems = $this->fuckhtml ->getElementsByTagName("*"); // get title $data["title"] = $this->fuckhtml ->getTextContent( $this->fuckhtml ->getElementsByClassName( "snippet-title", $elems ) [0] ["innerHTML"] ); // get description $data["description"] = $this->titledots( $this->fuckhtml ->getTextContent( $this->fuckhtml ->getElementsByClassName( "snippet-description", $elems ) [0] ["innerHTML"] ) ); // get date $date = explode( "•", $this->fuckhtml ->getTextContent( $this->fuckhtml ->getElementsByClassName( "snippet-url", $elems )[0] ) ); if( count($date) !== 1 && trim($date[1]) != "" ){ $data["date"] = strtotime( $date[1] ); } // get URL $data["url"] = $this->fuckhtml->getTextContent( $this->unshiturl( $this->fuckhtml ->getElementsByClassName( "result-header", $elems ) [0] ["attributes"] ["href"] ) ); // get thumbnail $thumb = $this->fuckhtml ->getElementsByTagName( "img" ); if( count($thumb) === 2 && trim( $thumb[1] ["attributes"] ["src"] ) != "" ){ $data["thumb"] = [ "url" => $this->fuckhtml->getTextContent( $this->unshiturl( $thumb[1] ["attributes"] ["src"] ) ), "ratio" => "16:9" ]; } $out["news"][] = $data; } return $out; } public function image($get){ $search = $get["s"]; $country = $get["country"]; $nsfw = $get["nsfw"]; $out = [ "status" => "ok", "npt" => null, "image" => [] ]; try{ $html = $this->get( "https://search.brave.com/images", [ "q" => $search ], $nsfw, $country ); }catch(Exception $error){ throw new Exception("Could not fetch search page"); } /* $handle = fopen("scraper/brave-image.html", "r"); $html = fread($handle, filesize("scraper/brave-image.html")); fclose($handle);*/ preg_match( '/const data = (\[{.*}\]);/', $html, $json ); if(!isset($json[1])){ throw new Exception("Failed to get data object"); } $json = $this->fuckhtml ->parseJsObject( $json[1] ); foreach( $json[1] ["data"] ["body"] ["response"] ["results"] as $result ){ $out["image"][] = [ "title" => $result["title"], "source" => [ [ "url" => $result["properties"]["url"], "width" => null, "height" => null ], [ "url" => $result["thumbnail"]["src"], "width" => null, "height" => null ] ], "url" => $result["url"] ]; } return $out; } public function video($get){ $search = $get["s"]; $country = $get["country"]; $nsfw = $get["nsfw"]; $out = [ "status" => "ok", "npt" => null, "video" => [], "author" => [], "livestream" => [], "playlist" => [], "reel" => [] ]; try{ $html = $this->get( "https://search.brave.com/videos", [ "q" => $search ], $nsfw, $country ); }catch(Exception $error){ throw new Exception("Could not fetch search page"); } /* $handle = fopen("scraper/brave-video.html", "r"); $html = fread($handle, filesize("scraper/brave-video.html")); fclose($handle);*/ preg_match( '/const data = (\[{.*}\]);/', $html, $json ); if(!isset($json[1])){ throw new Exception("Failed to get data object"); } $json = $this->fuckhtml ->parseJsObject( $json[1] ); foreach( $json [1] ["data"] ["body"] ["response"] ["results"] as $result ){ if($result["video"]["author"] != "null"){ $author = [ "name" => $result["video"]["author"]["name"] == "null" ? null : $result["video"]["author"]["name"], "url" => $result["video"]["author"]["url"] == "null" ? null : $result["video"]["author"]["url"], "avatar" => $result["video"]["author"]["img"] == "null" ? null : $result["video"]["author"]["img"] ]; }else{ $author = [ "name" => null, "url" => null, "avatar" => null ]; } if($result["thumbnail"] != "null"){ $thumb = [ "url" => $result["thumbnail"]["original"], "ratio" => "16:9" ]; }else{ $thumb = [ "url" => null, "ratio" => null ]; } $out["video"][] = [ "title" => $result["title"], "description" => $result["description"] == "null" ? null : $this->titledots($result["description"]), "author" => $author, "date" => $result["age"] == "null" ? null : strtotime($result["age"]), "duration" => $result["video"]["duration"] == "null" ? null : $this->hms2int($result["video"]["duration"]), "views" => $result["video"]["views"] == "null" ? null : (int)$result["video"]["views"], "thumb" => $thumb, "url" => $result["url"] ]; } return $out; } private function stackoverflow_parse($html){ $i = 0; $answer = []; $this->fuckhtml->load($html); foreach( $this->fuckhtml->getElementsByTagName("*") as $snippet ){ switch($snippet["tagName"]){ case "p": $this->fuckhtml->load($snippet["innerHTML"]); $codetags = $this->fuckhtml ->getElementsByTagName("*"); $tmphtml = $snippet["innerHTML"]; foreach($codetags as $tag){ if(!isset($tag["outerHTML"])){ continue; } $tmphtml = explode( $tag["outerHTML"], $tmphtml, 2 ); $value = $this->fuckhtml->getTextContent($tmphtml[0], false, false); $this->appendtext($value, $answer, $i); $type = null; switch($tag["tagName"]){ case "code": $type = "inline_code"; break; case "em": $type = "italic"; break; case "blockquote": $type = "quote"; break; default: $type = "text"; } if($type !== null){ $value = $this->fuckhtml->getTextContent($tag, false, true); if(trim($value) != ""){ if( $i !== 0 && $type == "title" ){ $answer[$i - 1]["value"] = rtrim($answer[$i - 1]["value"]); } $answer[] = [ "type" => $type, "value" => $value ]; $i++; } } if(count($tmphtml) === 2){ $tmphtml = $tmphtml[1]; }else{ break; } } if(is_array($tmphtml)){ $tmphtml = $tmphtml[0]; } if(strlen($tmphtml) !== 0){ $value = $this->fuckhtml->getTextContent($tmphtml, false, false); $this->appendtext($value, $answer, $i); } break; case "img": $answer[] = [ "type" => "image", "url" => $this->fuckhtml ->getTextContent( $tag["attributes"]["src"] ) ]; $i++; break; case "pre": switch($answer[$i - 1]["type"]){ case "text": case "italic": $answer[$i - 1]["value"] = rtrim($answer[$i - 1]["value"]); break; } $answer[] = [ "type" => "code", "value" => rtrim( $this->fuckhtml ->getTextContent( $snippet, true, false ) ) ]; $i++; break; case "ol": $o = 0; $this->fuckhtml->load($snippet); $li = $this->fuckhtml ->getElementsByTagName("li"); foreach($li as $elem){ $o++; $this->appendtext( $o . ". " . $this->fuckhtml ->getTextContent( $elem ), $answer, $i ); } break; } } if( $i !== 0 && $answer[$i - 1]["type"] == "text" ){ $answer[$i - 1]["value"] = rtrim($answer[$i - 1]["value"]); } return $answer; } private function hms2int($time){ $parts = explode(":", $time, 3); $time = 0; if(count($parts) === 3){ // hours $time = $time + ((int)$parts[0] * 3600); array_shift($parts); } if(count($parts) === 2){ // minutes $time = $time + ((int)$parts[0] * 60); array_shift($parts); } // seconds $time = $time + (int)$parts[0]; return $time; } private function appendtext($payload, &$text, &$index){ if(trim($payload) == ""){ return; } if( $index !== 0 && $text[$index - 1]["type"] == "text" ){ $text[$index - 1]["value"] .= "\n\n" . preg_replace('/ $/', " ", $payload); }else{ $text[] = [ "type" => "text", "value" => preg_replace('/ $/', " ", $payload) ]; $index++; } } private function tablesublink($html_collection, &$data){ foreach($html_collection as $html){ $html["innerHTML"] = preg_replace( '/