This commit is contained in:
2023-11-07 08:04:56 -05:00
parent 64b090ee05
commit 785452873f
59 changed files with 2592 additions and 1277 deletions

View File

@@ -7,8 +7,8 @@ class brave{
include "lib/fuckhtml.php";
$this->fuckhtml = new fuckhtml();
include "lib/nextpage.php";
$this->nextpage = new nextpage("brave");
include "lib/backend.php";
$this->backend = new backend("brave");
}
public function getfilters($page){
@@ -138,13 +138,20 @@ class brave{
"maybe" => "Maybe",
"no" => "No"
]
],
"spellcheck" => [
"display" => "Spellcheck",
"option" => [
"yes" => "Yes",
"no" => "No"
]
]
];
break;
}
}
private function get($url, $get = [], $nsfw, $country){
private function get($proxy, $url, $get = [], $nsfw, $country){
switch($nsfw){
@@ -159,7 +166,7 @@ class brave{
}
$headers = [
"User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:107.0) Gecko/20100101 Firefox/110.0",
"User-Agent: " . config::USER_AGENT,
"Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
"Accept-Language: en-US,en;q=0.5",
"Accept-Encoding: gzip",
@@ -190,11 +197,12 @@ class brave{
curl_setopt($curlproc, CURLOPT_SSL_VERIFYPEER, true);
curl_setopt($curlproc, CURLOPT_CONNECTTIMEOUT, 30);
curl_setopt($curlproc, CURLOPT_TIMEOUT, 30);
$this->backend->assign_proxy($curlproc, $proxy);
$data = curl_exec($curlproc);
if(curl_errno($curlproc)){
throw new Exception(curl_error($curlproc));
}
@@ -207,7 +215,9 @@ class brave{
if($get["npt"]){
// get next page data
$q = json_decode($this->nextpage->get($get["npt"], "web"), true);
[$q, $proxy] = $this->backend->get($get["npt"], "web");
$q = json_decode($q, true);
$search = $q["q"];
$q["spellcheck"] = "0";
@@ -222,7 +232,6 @@ class brave{
// get _GET data instead
$search = $get["s"];
if(strlen($search) === 0){
throw new Exception("Search term is empty!");
@@ -230,9 +239,10 @@ class brave{
if(strlen($search) > 2048){
throw new Exception("Search query is too long!");
throw new Exception("Search term is too long!");
}
$proxy = $this->backend->get_ip();
$nsfw = $get["nsfw"];
$country = $get["country"];
$older = $get["older"];
@@ -288,6 +298,7 @@ class brave{
try{
$html =
$this->get(
$proxy,
"https://search.brave.com/search",
$q,
$nsfw,
@@ -361,9 +372,10 @@ class brave{
$q["country"] = $country;
$out["npt"] =
$this->nextpage->store(
$this->backend->store(
json_encode($q),
"web"
"web",
$proxy
);
}
}
@@ -759,7 +771,9 @@ class brave{
"description" =>
isset($result["review"]["description"]) ?
$this->limitstrlen(
$result["review"]["description"]
strip_tags(
$result["review"]["description"]
)
) :
$this->titledots(
$this->fuckhtml
@@ -839,6 +853,32 @@ class brave{
"value" => $this->titledots($info["long_desc"])
];
}
// parse ratings
if(
isset($info["ratings"]) &&
$info["ratings"] != "void 0"
){
$description[] = [
"type" => "title",
"value" => "Ratings"
];
foreach($info["ratings"] as $rating){
$description[] = [
"type" => "link",
"url" => $rating["profile"]["url"],
"value" => $rating["profile"]["name"]
];
$description[] = [
"type" => "text",
"value" => ": " . $rating["ratingValue"] . "/" . $rating["bestRating"] . "\n"
];
}
}
}
$table = [];
@@ -908,9 +948,9 @@ class brave{
$out["video"][] = [
"title" => $this->titledots($video["title"]),
"description" => $this->titledots($video["description"]),
"date" => isset($video["age"]) ? strtotime($video["age"]) : null,
"duration" => isset($video["video"]["duration"]) ? $this->hms2int($video["video"]["duration"]) : null,
"views" => null,
"date" => isset($video["age"]) && $video["age"] != "void 0" ? strtotime($video["age"]) : null,
"duration" => isset($video["video"]["duration"]) && $video["video"]["duration"] != "void 0" ? $this->hms2int($video["video"]["duration"]) : null,
"views" => isset($video["video"]["views"]) && $video["video"]["views"] != "void 0" ? (int)$video["video"]["views"] : null,
"thumb" =>
isset($video["thumbnail"]["src"]) ?
[
@@ -1008,37 +1048,75 @@ class brave{
public function news($get){
$search = $get["s"];
if(strlen($search) === 0){
if($get["npt"]){
throw new Exception("Search term is empty!");
}
$nsfw = $get["nsfw"];
$country = $get["country"];
if(strlen($search) > 2048){
[$req, $proxy] = $this->backend->get($get["npt"], "news");
throw new Exception("Search query is too long!");
}
/*
$handle = fopen("scraper/brave-news.html", "r");
$html = fread($handle, filesize("scraper/brave-news.html"));
fclose($handle);*/
try{
$html =
$this->get(
"https://search.brave.com/news",
[
"q" => $search
],
$nsfw,
$country
);
$req = json_decode($req, true);
}catch(Exception $error){
$search = $req["q"];
$country = $req["country"];
$nsfw = $req["nsfw"];
$offset = $req["offset"];
$spellcheck = $req["spellcheck"];
throw new Exception("Could not fetch search page");
try{
$html =
$this->get(
$proxy,
"https://search.brave.com/news",
[
"q" => $search,
"offset" => $offset,
"spellcheck" => $spellcheck
],
$nsfw,
$country
);
}catch(Exception $error){
throw new Exception("Could not fetch search page");
}
}else{
$search = $get["s"];
if(strlen($search) === 0){
throw new Exception("Search term is empty!");
}
if(strlen($search) > 2048){
throw new Exception("Search term is too long!");
}
$proxy = $this->backend->get_ip();
$nsfw = $get["nsfw"];
$country = $get["country"];
$spellcheck = $get["spellcheck"] == "yes" ? "1" : "0";
/*
$handle = fopen("scraper/brave-news.html", "r");
$html = fread($handle, filesize("scraper/brave-news.html"));
fclose($handle);*/
try{
$html =
$this->get(
$proxy,
"https://search.brave.com/news",
[
"q" => $search,
"spellcheck" => $spellcheck
],
$nsfw,
$country
);
}catch(Exception $error){
throw new Exception("Could not fetch search page");
}
}
$out = [
@@ -1050,6 +1128,17 @@ class brave{
// load html
$this->fuckhtml->load($html);
// get npt
$out["npt"] =
$this->generatenextpagetoken(
$search,
$nsfw,
$country,
$spellcheck,
"news",
$proxy
);
$news =
$this->fuckhtml
->getElementsByClassName(
@@ -1183,8 +1272,19 @@ class brave{
public function image($get){
$search = $get["s"];
if(strlen($search) === 0){
throw new Exception("Search term is empty!");
}
if(strlen($search) > 2048){
throw new Exception("Search term is too long!");
}
$country = $get["country"];
$nsfw = $get["nsfw"];
$spellcheck = $get["spellcheck"] == "yes" ? "1" : "0";
$out = [
"status" => "ok",
@@ -1195,9 +1295,11 @@ class brave{
try{
$html =
$this->get(
$this->backend->get_ip(), // no nextpage right now, pass proxy directly
"https://search.brave.com/images",
[
"q" => $search
"q" => $search,
"spellcheck" => $spellcheck
],
$nsfw,
$country
@@ -1261,9 +1363,75 @@ class brave{
public function video($get){
$search = $get["s"];
$country = $get["country"];
$nsfw = $get["nsfw"];
if($get["npt"]){
[$npt, $proxy] = $this->backend->get($get["npt"], "videos");
$npt = json_decode($npt, true);
$search = $npt["q"];
$offset = $npt["offset"];
$spellcheck = $npt["spellcheck"];
$country = $npt["country"];
$nsfw = $npt["nsfw"];
try{
$html =
$this->get(
$proxy,
"https://search.brave.com/videos",
[
"q" => $search,
"offset" => $offset,
"spellcheck" => $spellcheck
],
$nsfw,
$country
);
}catch(Exception $error){
throw new Exception("Could not fetch search page");
}
}else{
$search = $get["s"];
if(strlen($search) === 0){
throw new Exception("Search term is empty!");
}
if(strlen($search) > 2048){
throw new Exception("Search term is too long!");
}
$country = $get["country"];
$nsfw = $get["nsfw"];
$spellcheck = $get["spellcheck"] == "yes" ? "1" : "0";
$proxy = $this->backend->get_ip();
try{
$html =
$this->get(
$proxy,
"https://search.brave.com/videos",
[
"q" => $search,
"spellcheck" => $spellcheck
],
$nsfw,
$country
);
}catch(Exception $error){
throw new Exception("Could not fetch search page");
}
}
$this->fuckhtml->load($html);
$out = [
"status" => "ok",
@@ -1275,21 +1443,17 @@ class brave{
"reel" => []
];
try{
$html =
$this->get(
"https://search.brave.com/videos",
[
"q" => $search
],
$nsfw,
$country
);
}catch(Exception $error){
throw new Exception("Could not fetch search page");
}
// get npt
$out["npt"] =
$this->generatenextpagetoken(
$search,
$nsfw,
$country,
$spellcheck,
"videos",
$proxy
);
/*
$handle = fopen("scraper/brave-video.html", "r");
$html = fread($handle, filesize("scraper/brave-video.html"));
@@ -1606,7 +1770,7 @@ class brave{
$data["table"][trim($html[0])] = trim($html[1]);
}
}
/*
private function getimagelinkfromstyle($thumb){
$thumb =
@@ -1646,13 +1810,13 @@ class brave{
"url" => $url,
"ratio" => "16:9"
];
}
}*/
private function limitstrlen($text){
	
	// word-wrap at 300 columns, then keep only what precedes
	// the first line break (the whole string if there is none)
	$wrapped = wordwrap($text, 300, "\n");
	$cut = strpos($wrapped, "\n");
	
	if($cut === false){
		
		return $wrapped;
	}
	
	return substr($wrapped, 0, $cut);
}
/*
private function limitwhitespace($text){
return
@@ -1661,7 +1825,7 @@ class brave{
" ",
$text
);
}
}*/
private function titledots($title){
@@ -1678,6 +1842,52 @@ class brave{
return trim($title);
}
private function generatenextpagetoken($q, $nsfw, $country, $spellcheck, $page, $proxy){
	
	/*
		Build a next page token from the document currently loaded
		in $this->fuckhtml.
		
		Takes the last element returned by
		getElementsByClassName("btn", "a"). If its text content is
		"next" (case-insensitive) and its href contains offset=N,
		the query state is json-encoded and passed to
		$this->backend->store() along with $page and $proxy.
		
		Returns the value of store(), or null when no usable
		"next" link was found.
	*/
	$buttons =
		$this->fuckhtml
		->getElementsByClassName("btn", "a");
	
	if(count($buttons) === 0){
		
		return null;
	}
	
	$last = $buttons[count($buttons) - 1];
	
	if(
		strtolower(
			$this->fuckhtml
			->getTextContent(
				$last
			)
		) !== "next"
	){
		
		return null;
	}
	
	// a "next" button without an href gives us nothing to parse
	if(!isset($last["attributes"]["href"])){
		
		return null;
	}
	
	// bail out when the link carries no offset: storing a token
	// with offset 0 would just serve the first page again
	if(
		preg_match(
			'/offset=([0-9]+)/',
			$this->fuckhtml->getTextContent($last["attributes"]["href"]),
			$match
		) !== 1
	){
		
		return null;
	}
	
	return
		$this->backend->store(
			json_encode(
				[
					"q" => $q,
					"offset" => (int)$match[1],
					"nsfw" => $nsfw,
					"country" => $country,
					"spellcheck" => $spellcheck
				]
			),
			$page,
			$proxy
		);
}
private function unshiturl($url){
// https://imgs.search.brave.com/XFnbR8Sl7ge82MBDEH7ju0UHImRovMVmQ2qnDvgNTuA/rs:fit:844:225:1/g:ce/aHR0cHM6Ly90c2U0/Lm1tLmJpbmcubmV0/L3RoP2lkPU9JUC54/UWotQXU5N2ozVndT/RDJnNG9BNVhnSGFF/SyZwaWQ9QXBp.jpeg

View File

@@ -4,8 +4,11 @@ class ddg{
public function __construct(){
include "lib/nextpage.php";
$this->nextpage = new nextpage("ddg");
include "lib/backend.php";
$this->backend = new backend("ddg");
include "lib/fuckhtml.php";
$this->fuckhtml = new fuckhtml();
}
/*
@@ -14,7 +17,7 @@ class ddg{
private const req_web = 0;
private const req_xhr = 1;
private function get($url, $get = [], $reqtype = self::req_web){
private function get($proxy, $url, $get = [], $reqtype = self::req_web){
$curlproc = curl_init();
@@ -28,7 +31,7 @@ class ddg{
switch($reqtype){
case self::req_web:
$headers =
["User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/113.0",
["User-Agent: " . config::USER_AGENT,
"Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
"Accept-Encoding: gzip",
"Accept-Language: en-US,en;q=0.5",
@@ -43,7 +46,7 @@ class ddg{
case self::req_xhr:
$headers =
["User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/113.0",
["User-Agent: " . config::USER_AGENT,
"Accept: */*",
"Accept-Encoding: gzip",
"Accept-Language: en-US,en;q=0.5",
@@ -57,6 +60,8 @@ class ddg{
break;
}
$this->backend->assign_proxy($curlproc, $proxy);
curl_setopt($curlproc, CURLOPT_ENCODING, ""); // default encoding
curl_setopt($curlproc, CURLOPT_HTTPHEADER, $headers);
@@ -69,7 +74,6 @@ class ddg{
$data = curl_exec($curlproc);
if(curl_errno($curlproc)){
throw new Exception(curl_error($curlproc));
}
@@ -541,9 +545,11 @@ class ddg{
public function web($get){
$proxy = null;
if($get["npt"]){
$jsgrep = $this->nextpage->get($get["npt"], "web");
[$jsgrep, $proxy] = $this->backend->get($get["npt"], "web");
$extendedsearch = false;
$inithtml = "";
@@ -555,6 +561,7 @@ class ddg{
throw new Exception("Search term is empty!");
}
$proxy = $this->backend->get_ip();
$country = $get["country"];
$nsfw = $get["nsfw"];
$older = $get["older"];
@@ -614,9 +621,9 @@ class ddg{
/*
Get html
*/
// https://duckduckgo.com/?q=minecraft&kz=1&k1=-1&kp=-2
try{
$inithtml = $this->get(
$proxy,
"https://duckduckgo.com/",
$get_filters
);
@@ -643,6 +650,7 @@ class ddg{
try{
$js = $this->get(
$proxy,
"https://links.duckduckgo.com" . $jsgrep,
[],
ddg::req_xhr
@@ -692,6 +700,7 @@ class ddg{
// get definition
$wordnikjs = $this->get(
$proxy,
"https://duckduckgo.com/js/spice/dictionary/definition/" . $wordnik,
[],
ddg::req_xhr
@@ -725,6 +734,7 @@ class ddg{
$wordnikaudio_json =
json_decode(
$this->get(
$proxy,
"https://duckduckgo.com/js/spice/dictionary/audio/" . $wordnik,
[],
ddg::req_xhr
@@ -922,6 +932,7 @@ class ddg{
try{
$stackjs = $this->get(
$proxy,
"https://duckduckgo.com" . $stack,
[],
ddg::req_xhr
@@ -944,7 +955,7 @@ class ddg{
$out["answer"][] = [
"title" => $stackjson["Heading"],
"description" => $this->htmltoarray($stackjson["Abstract"]),
"description" => $this->stackoverflow_parse($stackjson["Abstract"]),
"url" => str_replace(["http://", "ddg"], ["https://", ""], $stackjson["AbstractURL"]),
"thumb" => null,
"table" => [],
@@ -973,6 +984,7 @@ class ddg{
try{
$lyricsjs = $this->get(
$proxy,
"https://duckduckgo.com" . $lyrics,
[],
ddg::req_xhr
@@ -1166,13 +1178,13 @@ class ddg{
if(isset($answers[$i]["data"]["AbstractText"]) && !empty($answers[$i]["data"]["AbstractText"])){
$description = $this->htmltoarray($answers[$i]["data"]["AbstractText"]);
$description = $this->stackoverflow_parse($answers[$i]["data"]["AbstractText"]);
}elseif(isset($answers[$i]["data"]["Abstract"]) && !empty($answers[$i]["data"]["Abstract"])){
$description = $this->htmltoarray($answers[$i]["data"]["Abstract"]);
$description = $this->stackoverflow_parse($answers[$i]["data"]["Abstract"]);
}elseif(isset($answers[$i]["data"]["Answer"]) && !empty($answers[$i]["data"]["Answer"])){
$description = $this->htmltoarray($answers[$i]["data"]["Answer"]);
$description = $this->stackoverflow_parse($answers[$i]["data"]["Answer"]);
}else{
$description = [];
@@ -1310,6 +1322,7 @@ class ddg{
$description = [];
$shitcoinjs = $this->get(
$proxy,
"https://duckduckgo.com/js/spice/cryptocurrency/{$shitcoins[1]}/{$shitcoins[2]}/1",
[],
ddg::req_xhr
@@ -1408,6 +1421,7 @@ class ddg{
try{
$currencyjs = $this->get(
$proxy,
"https://duckduckgo.com/js/spice/currency/{$amount}/" . strtolower($currencies[1]) . "/" . strtolower($currencies[2]),
[],
ddg::req_xhr
@@ -1607,7 +1621,7 @@ class ddg{
// store next page token
if(isset($web[$i]["n"])){
$out["npt"] = $this->nextpage->store($web[$i]["n"] . "&biaexp=b&eslexp=a&litexp=c&msvrtexp=b&wrap=1", "web");
$out["npt"] = $this->backend->store($web[$i]["n"] . "&biaexp=b&eslexp=a&litexp=c&msvrtexp=b&wrap=1", "web", $proxy);
continue;
}
@@ -1874,10 +1888,11 @@ class ddg{
if($get["npt"]){
$npt = $this->nextpage->get($get["npt"], "images");
[$npt, $proxy] = $this->backend->get($get["npt"], "images");
try{
$json = json_decode($this->get(
$proxy,
"https://duckduckgo.com/i.js?" . $npt,
[],
ddg::req_xhr
@@ -1895,6 +1910,7 @@ class ddg{
throw new Exception("Search term is empty!");
}
$proxy = $this->backend->get_ip();
$country = $get["country"];
$nsfw = $get["nsfw"];
$date = $get["date"];
@@ -1934,6 +1950,7 @@ class ddg{
try{
$html = $this->get(
$proxy,
"https://duckduckgo.com",
$get_filters,
ddg::req_web
@@ -1980,6 +1997,7 @@ class ddg{
try{
$json = json_decode($this->get(
$proxy,
"https://duckduckgo.com/i.js",
$js_params,
ddg::req_xhr
@@ -2005,10 +2023,11 @@ class ddg{
}
$out["npt"] =
$this->nextpage->store(
$this->backend->store(
explode("?", $json["next"])[1] . "&vqd=" .
$vqd,
"images"
"images",
$proxy
);
}
@@ -2046,10 +2065,11 @@ class ddg{
if($get["npt"]){
$npt = $this->nextpage->get($get["npt"], "videos");
[$npt, $proxy] = $this->backend->get($get["npt"], "videos");
try{
$json = json_decode($this->get(
$proxy,
"https://duckduckgo.com/v.js?" .
$npt,
[],
@@ -2068,6 +2088,7 @@ class ddg{
throw new Exception("Search term is empty!");
}
$proxy = $this->backend->get_ip();
$country = $get["country"];
$nsfw = $get["nsfw"];
$date = $get["date"];
@@ -2099,6 +2120,7 @@ class ddg{
try{
$html = $this->get(
$proxy,
"https://duckduckgo.com",
$get_filters,
ddg::req_web
@@ -2123,6 +2145,7 @@ class ddg{
try{
$json = json_decode($this->get(
$proxy,
"https://duckduckgo.com/v.js",
[
"l" => "us-en",
@@ -2155,9 +2178,10 @@ class ddg{
if(isset($json["next"])){
$out["npt"] =
$this->nextpage->store(
$this->backend->store(
explode("?", $json["next"])[1],
"videos"
"videos",
$proxy
);
}
@@ -2213,11 +2237,12 @@ class ddg{
if($get["npt"]){
$req = $this->nextpage->get($get["npt"], "news");
[$req, $proxy] = $this->backend->get($get["npt"], "news");
try{
$json = json_decode($this->get(
$proxy,
"https://duckduckgo.com/news.js?" .
$req,
[],
@@ -2236,6 +2261,7 @@ class ddg{
throw new Exception("Search term is empty!");
}
$proxy = $this->backend->get_ip();
$country = $get["country"];
$nsfw = $get["nsfw"];
$date = $get["date"];
@@ -2261,6 +2287,7 @@ class ddg{
try{
$html = $this->get(
$proxy,
"https://duckduckgo.com",
$get_params,
ddg::req_web
@@ -2303,6 +2330,7 @@ class ddg{
}
$json = json_decode($this->get(
$proxy,
"https://duckduckgo.com/news.js",
$js_params,
ddg::req_xhr
@@ -2323,9 +2351,10 @@ class ddg{
if(isset($json["next"])){
$out["npt"] =
$this->nextpage->store(
$this->backend->store(
explode("?", $json["next"])[1],
"news"
"news",
$proxy
);
}
@@ -2415,192 +2444,193 @@ class ddg{
return "https://" . $parse["host"] . "/th?id=" . urlencode($parts["id"]);
}
private function htmltoarray($html){
private function appendtext($payload, &$text, &$index){
$html = strip_tags($html, ["img", "pre", "code", "br", "h1", "h2", "h3", "h4", "h5", "h6", "blockquote", "a"]);
if(trim($payload) == ""){
return;
}
libxml_use_internal_errors(true);
$dom = new DOMDocument("1.0", "utf-8");
$dom->loadHTML('<div>' . $html . '</div>');
$xpath = new DOMXPath($dom);
$descendants = $xpath->query('//div/node()');
if(
$index !== 0 &&
$text[$index - 1]["type"] == "text"
){
$text[$index - 1]["value"] .= preg_replace('/ $/', " ", $payload);
}else{
$text[] = [
"type" => "text",
"value" => preg_replace('/ $/', " ", $payload)
];
$index++;
}
}
private function stackoverflow_parse($html){
$images = $xpath->query('//div/node()/img');
$imageiterator = 0;
$i = 0;
$answer = [];
if(count($descendants) === 0){
$this->fuckhtml->load($html);
$tags = $this->fuckhtml->getElementsByTagName("*");
if(count($tags) === 0){
return [
"type" => "text",
"value" => $this->unescapehtml($html)
[
"type" => "text",
"value" => htmlspecialchars_decode($html)
]
];
}
$array = [];
$previoustype = null;
foreach($descendants as $node){
foreach($tags as $snippet){
// $node->nodeValue = iconv("UTF-8", "ISO-8859-1//TRANSLIT", $node->nodeValue);
// get node type
switch($node->nodeName){
case "#text":
$type = "text";
break;
switch($snippet["tagName"]){
case "pre":
$type = "code";
break;
case "code":
$type = "inline_code";
break;
case "h1":
case "h2":
case "h3":
case "h4":
case "h5":
case "h6":
$type = "title";
break;
case "blockquote":
$type = "quote";
break;
case "a":
$type = "link";
case "p":
$this->fuckhtml->load($snippet["innerHTML"]);
$codetags =
$this->fuckhtml
->getElementsByTagName("*");
$tmphtml = $snippet["innerHTML"];
foreach($codetags as $tag){
if(!isset($tag["outerHTML"])){
continue;
}
$tmphtml =
explode(
$tag["outerHTML"],
$tmphtml,
2
);
$value = $this->fuckhtml->getTextContent($tmphtml[0], false, false);
$this->appendtext($value, $answer, $i);
$type = null;
switch($tag["tagName"]){
case "code": $type = "inline_code"; break;
case "em": $type = "italic"; break;
case "blockquote": $type = "quote"; break;
default: $type = "text";
}
if($type !== null){
$value = $this->fuckhtml->getTextContent($tag, false, false);
if(trim($value) != ""){
$answer[] = [
"type" => $type,
"value" => rtrim($value)
];
$i++;
}
}
if(count($tmphtml) === 2){
$tmphtml = $tmphtml[1] . "\n";
}else{
break;
}
}
if(is_array($tmphtml)){
$tmphtml = $tmphtml[0];
}
if(strlen($tmphtml) !== 0){
$value = $this->fuckhtml->getTextContent($tmphtml, true, false);
$this->appendtext($value, $answer, $i);
}
break;
case "img":
$type = "image";
break;
}
// add node to array
switch($type){
case "text":
$value = preg_replace(
'/ {2,}/',
" ",
$this->limitnewlines($this->unescapehtml($node->textContent))
);
if(
$previoustype == "quote" ||
$previoustype === null ||
$previoustype == "image" ||
$previoustype == "title" ||
$previoustype == "code"
){
$value = ltrim($value);
}
if($value == ""){
$previoustype = $type;
continue 2;
}
// merge with previous text node
if($previoustype == "text"){
$array[count($array) - 1]["value"] = trim($array[count($array) - 1]["value"]) . "\n" . $this->bstoutf8($value);
}else{
$array[] = [
"type" => "text",
"value" => $this->bstoutf8($value)
];
}
break;
case "inline_code":
case "bold":
$array[] = [
"type" => "inline_code",
"value" => $this->bstoutf8(trim($this->limitnewlines($this->unescapehtml($node->textContent))))
];
break;
case "link":
// check for link nested inside of image
if(strlen($node->childNodes->item(0)->textContent) !== 0){
$array[] = [
"type" => "link",
"value" => $this->bstoutf8(trim($this->unescapehtml($node->textContent))),
"url" => $this->bstoutf8(preg_replace('/\/ddg$/', "", preg_replace('/^http:\/\//', "https://", $this->sanitizeurl($node->getAttribute("href")))))
];
break;
}
$type = "image";
if($previoustype == "text"){
$array[count($array) - 1]["value"] = rtrim($array[count($array) - 1]["value"]);
}
$array[] = [
$answer[] = [
"type" => "image",
"url" => $this->bstoutf8(preg_replace('/^http:\/\//', "https://", preg_replace('/^\/\/images\.duckduckgo\.com\/iu\/\?u=/', "", $images->item($imageiterator)->getAttribute("src"))))
"url" =>
$this->fuckhtml
->getTextContent(
$tag["attributes"]["src"]
)
];
$i++;
break;
case "pre":
switch($answer[$i - 1]["type"]){
case "text":
case "italic":
$answer[$i - 1]["value"] = rtrim($answer[$i - 1]["value"]);
break;
}
$imageiterator++;
$answer[] =
[
"type" => "code",
"value" =>
rtrim(
$this->fuckhtml
->getTextContent(
$snippet,
true,
false
)
)
];
$i++;
break;
case "image":
case "ol":
$o = 0;
if($previoustype == "text"){
$array[count($array) - 1]["value"] = rtrim($array[count($array) - 1]["value"]);
}
$this->fuckhtml->load($snippet);
$li =
$this->fuckhtml
->getElementsByTagName("li");
$array[] = [
"type" => "image",
"url" => $this->bstoutf8(preg_replace('/^http:\/\//', "https://", preg_replace('/^\/\/images\.duckduckgo\.com\/iu\/\?u=/', "", $node->getAttribute("src"))))
];
break;
case "quote":
case "title":
case "code":
if($previoustype == "text"){
foreach($li as $elem){
$o++;
$array[count($array) - 1]["value"] = rtrim($array[count($array) - 1]["value"]);
}
// no break
default:
$value = trim($this->limitnewlines($this->unescapehtml($node->textContent)));
if($type != "code"){
$value = preg_replace(
'/ {2,}/',
" ",
$value
$this->appendtext(
$o . ". " .
$this->fuckhtml
->getTextContent(
$elem
),
$answer,
$i
);
}
$array[] = [
"type" => $type,
"value" => $this->bstoutf8($value)
];
break;
}
$previoustype = $type;
}
return $array;
if(
$i !== 0 &&
$answer[$i - 1]["type"] == "text"
){
$answer[$i - 1]["value"] = rtrim($answer[$i - 1]["value"]);
}
return $answer;
}
private function bstoutf8($bs){

View File

@@ -9,6 +9,9 @@ class facebook{
include "lib/nextpage.php";
$this->nextpage = new nextpage("fb");
include "lib/proxy_pool.php";
$this->proxy = new proxy_pool("facebook");
}
public function getfilters($page){
@@ -104,6 +107,8 @@ class facebook{
curl_setopt($curlproc, CURLOPT_SSL_VERIFYPEER, true);
curl_setopt($curlproc, CURLOPT_CONNECTTIMEOUT, 30);
curl_setopt($curlproc, CURLOPT_TIMEOUT, 30);
$this->proxy->assign_proxy($curlproc);
$data = curl_exec($curlproc);

View File

@@ -4,8 +4,8 @@ class ftm{
public function __construct(){
include "lib/nextpage.php";
$this->nextpage = new nextpage("ftm");
include "lib/backend.php";
$this->backend = new backend("ftm");
}
public function getfilters($page){
@@ -13,7 +13,7 @@ class ftm{
return [];
}
private function get($url, $search, $offset){
private function get($proxy, $url, $search, $offset){
$curlproc = curl_init();
@@ -29,7 +29,7 @@ class ftm{
curl_setopt($curlproc, CURLOPT_ENCODING, ""); // default encoding
curl_setopt($curlproc, CURLOPT_HTTPHEADER,
["User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:107.0) Gecko/20100101 Firefox/110.0",
["User-Agent: " . config::USER_AGENT,
"Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
"Accept-Language: en-US,en;q=0.5",
"Accept-Encoding: gzip",
@@ -56,6 +56,8 @@ class ftm{
curl_setopt($curlproc, CURLOPT_SSL_VERIFYPEER, true);
curl_setopt($curlproc, CURLOPT_CONNECTTIMEOUT, 30);
curl_setopt($curlproc, CURLOPT_TIMEOUT, 30);
$this->backend->assign_proxy($curlproc, $proxy);
$data = curl_exec($curlproc);
@@ -70,8 +72,6 @@ class ftm{
public function image($get){
$search = $get["s"];
$out = [
"status" => "ok",
"npt" => null,
@@ -80,16 +80,28 @@ class ftm{
if($get["npt"]){
$count = (int)$this->nextpage->get($get["npt"], "images");
[$data, $proxy] = $this->backend->get($get["npt"], "images");
$data = json_decode($data, true);
$count = $data["count"];
$search = $data["search"];
}else{
$search = $get["s"];
if(strlen($search) === 0){
throw new Exception("Search term is empty!");
}
$count = 0;
$proxy = $this->backend->get_ip();
}
try{
$json =
json_decode(
$this->get(
$proxy,
"https://findthatmeme.com/api/v1/search",
$search,
$count
@@ -134,14 +146,15 @@ class ftm{
];
}
if($count === 50){
$out["npt"] =
$this->nextpage->store(
$count,
"images"
);
}
$out["npt"] =
$this->backend->store(
json_encode([
"count" => $count,
"search" => $search
]),
"images",
$proxy
);
return $out;
}

View File

@@ -10,8 +10,8 @@ class google{
include "lib/fuckhtml.php";
$this->fuckhtml = new fuckhtml();
include "lib/nextpage.php";
$this->nextpage = new nextpage("google");
include "lib/backend.php";
$this->backend = new backend("google");
}
public function getfilters($page){
@@ -727,7 +727,7 @@ class google{
}
}
private function get($url, $get = []){
private function get($proxy, $url, $get = []){
$headers = [
"User-Agent: Mozilla/5.0 (Linux; U; Android 2.3.3; pt-pt; LG-P500h-parrot Build/GRI40) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1 MMS/LG-Android-MMS-V1.0/1.2",
@@ -760,6 +760,8 @@ class google{
curl_setopt($curlproc, CURLOPT_SSL_VERIFYPEER, true);
curl_setopt($curlproc, CURLOPT_CONNECTTIMEOUT, 30);
curl_setopt($curlproc, CURLOPT_TIMEOUT, 30);
$this->backend->assign_proxy($curlproc, $proxy);
$data = curl_exec($curlproc);
@@ -771,7 +773,7 @@ class google{
curl_close($curlproc);
return $data;
}
/*
public function web($get){
$search = $get["s"];
@@ -877,9 +879,9 @@ class google{
if(count($title) !== 0){
/*
Container is a web link
*/
//
// Container is a web link
//
$web = [
"title" =>
$this->titledots(
@@ -1051,9 +1053,9 @@ class google{
continue;
}
/*
Parse rating object
*/
//
// Parse rating object
//
if($is_rating >= -1){
@@ -1102,9 +1104,9 @@ class google{
continue;
}
/*
Parse standalone text
*/
//
// Parse standalone text
//
$additional_info[] = $innertext;
}
}
@@ -1194,9 +1196,9 @@ class google{
$container_title == "people also search for"
){
/*
Parse related searches
*/
//
// Parse related searches
//
$as =
$this->fuckhtml
->getElementsByTagName("a");
@@ -1212,9 +1214,9 @@ class google{
continue;
}
/*
Parse image carousel
*/
//
// Parse image carousel
//
$title_container =
$this->fuckhtml
->getElementsByClassName(
@@ -1239,9 +1241,9 @@ class google{
if($title_container == "imagesview all"){
/*
Image carousel
*/
//
// Image carousel
//
$pcitem =
$this->fuckhtml
->getElementsByClassName(
@@ -1316,9 +1318,9 @@ class google{
}
}
/*
Get next page
*/
//
// Get next page
//
$as =
$this->fuckhtml
->getElementsByTagName("a");
@@ -1340,7 +1342,7 @@ class google{
}
return $out;
}
}*/
public function image($get){
@@ -1348,17 +1350,22 @@ class google{
// generate parameters
if($get["npt"]){
$params =
json_decode(
$this->nextpage->get(
$get["npt"],
"images"
),
true
[$params, $proxy] =
$this->backend->get(
$get["npt"],
"images"
);
$params = json_decode($params, true);
}else{
$search = $get["s"];
if(strlen($search) === 0){
throw new Exception("Search term is empty!");
}
$proxy = $this->backend->get_ip();
$country = $get["country"];
$nsfw = $get["nsfw"];
$lang = $get["lang"];
@@ -1475,6 +1482,7 @@ class google{
try{
$html =
$this->get(
$proxy,
"https://www.google.com/search",
$params
);
@@ -1578,9 +1586,10 @@ class google{
$params["ijn"] = (int)$params["ijn"] + 1;
$out["npt"] =
$this->nextpage->store(
$this->backend->store(
json_encode($params),
"images"
"images",
$proxy
);
}else{
@@ -1628,9 +1637,10 @@ class google{
$params["imgvl"] = $imgvl;
$out["npt"] =
$this->nextpage->store(
$this->backend->store(
json_encode($params),
"images"
"images",
$proxy
);
}
}

View File

@@ -4,11 +4,11 @@ class imgur{
public function __construct(){
include "lib/nextpage.php";
$this->nextpage = new nextpage("imgur");
include "lib/fuckhtml.php";
$this->fuckhtml = new fuckhtml();
include "lib/backend.php";
$this->backend = new backend("imgur");
}
public function getfilters($page){
@@ -57,7 +57,7 @@ class imgur{
];
}
private function get($url, $get = []){
private function get($proxy, $url, $get = []){
$curlproc = curl_init();
@@ -70,7 +70,7 @@ class imgur{
curl_setopt($curlproc, CURLOPT_ENCODING, ""); // default encoding
curl_setopt($curlproc, CURLOPT_HTTPHEADER,
["User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:107.0) Gecko/20100101 Firefox/110.0",
["User-Agent: " . config::USER_AGENT,
"Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
"Accept-Language: en-US,en;q=0.5",
"Accept-Encoding: gzip",
@@ -89,6 +89,8 @@ class imgur{
curl_setopt($curlproc, CURLOPT_SSL_VERIFYPEER, true);
curl_setopt($curlproc, CURLOPT_CONNECTTIMEOUT, 30);
curl_setopt($curlproc, CURLOPT_TIMEOUT, 30);
$this->backend->assign_proxy($curlproc, $proxy);
$data = curl_exec($curlproc);
@@ -105,15 +107,14 @@ class imgur{
if($get["npt"]){
$filter =
json_decode(
$this->nextpage->get(
$get["npt"],
"images"
),
true
[$filter, $proxy] =
$this->backend->get(
$get["npt"],
"images"
);
$filter = json_decode($filter, true);
$search = $filter["s"];
unset($filter["s"]);
@@ -134,6 +135,12 @@ class imgur{
}else{
$search = $get["s"];
if(strlen($search) === 0){
throw new Exception("Search term is empty!");
}
$proxy = $this->backend->get_ip();
$sort = $get["sort"];
$time = $get["time"];
$format = $get["format"];
@@ -165,6 +172,7 @@ class imgur{
try{
$html =
$this->get(
$proxy,
"https://imgur.com/search/$sort/$time/page/$page",
$filter
);
@@ -238,9 +246,10 @@ class imgur{
$filter["page"] = $page + 1;
$out["npt"] =
$this->nextpage->store(
$this->backend->store(
json_encode($filter),
"images"
"images",
$proxy
);
}

View File

@@ -3,7 +3,8 @@
class marginalia{
public function __construct(){
$this->key = "public";
include "lib/backend.php";
$this->backend = new backend("marginalia");
}
public function getfilters($page){
@@ -76,10 +77,10 @@ class marginalia{
}
}
private function get($url, $get = []){
private function get($proxy, $url, $get = []){
$headers = [
"User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:107.0) Gecko/20100101 Firefox/110.0",
"User-Agent: " . config::USER_AGENT,
"Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
"Accept-Language: en-US,en;q=0.5",
"Accept-Encoding: gzip",
@@ -109,6 +110,8 @@ class marginalia{
curl_setopt($curlproc, CURLOPT_SSL_VERIFYPEER, true);
curl_setopt($curlproc, CURLOPT_CONNECTTIMEOUT, 30);
curl_setopt($curlproc, CURLOPT_TIMEOUT, 30);
$this->backend->assign_proxy($curlproc, $proxy);
$data = curl_exec($curlproc);
@@ -124,6 +127,11 @@ class marginalia{
public function web($get){
$search = [$get["s"]];
if(strlen($get["s"]) === 0){
throw new Exception("Search term is empty!");
}
$profile = $get["profile"];
$format = $get["format"];
$file = $get["file"];
@@ -184,7 +192,8 @@ class marginalia{
try{
$json =
$this->get(
"https://api.marginalia.nu/{$this->key}/search/" . urlencode($search),
$this->backend->get_ip(), // no nextpage
"https://api.marginalia.nu/" . config::MARGINALIA_API_KEY . "/search/" . urlencode($search),
$params
);
}catch(Exception $error){

View File

@@ -6,8 +6,8 @@ class mojeek{
include "lib/fuckhtml.php";
$this->fuckhtml = new fuckhtml();
include "lib/nextpage.php";
$this->nextpage = new nextpage("mojeek");
include "lib/backend.php";
$this->backend = new backend("mojeek");
}
public function getfilters($page){
@@ -371,10 +371,10 @@ class mojeek{
}
}
private function get($url, $get = []){
private function get($proxy, $url, $get = []){
$headers = [
"User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:107.0) Gecko/20100101 Firefox/110.0",
"User-Agent: " . config::USER_AGENT,
"Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
"Accept-Language: en-US,en;q=0.5",
"Accept-Encoding: gzip",
@@ -404,6 +404,8 @@ class mojeek{
curl_setopt($curlproc, CURLOPT_SSL_VERIFYPEER, true);
curl_setopt($curlproc, CURLOPT_CONNECTTIMEOUT, 30);
curl_setopt($curlproc, CURLOPT_TIMEOUT, 30);
$this->backend->assign_proxy($curlproc, $proxy);
$data = curl_exec($curlproc);
@@ -420,11 +422,12 @@ class mojeek{
if($get["npt"]){
$token = $this->nextpage->get($get["npt"], "web");
[$token, $proxy] = $this->backend->get($get["npt"], "web");
try{
$html =
$this->get(
$proxy,
"https://www.mojeek.com" . $token,
[]
);
@@ -485,9 +488,12 @@ class mojeek{
$params["si"] = $domain;
}
$proxy = $this->backend->get_ip();
try{
$html =
$this->get(
$proxy,
"https://www.mojeek.com/search",
$params
);
@@ -529,88 +535,90 @@ class mojeek{
return $out;
}
$this->fuckhtml->load($results[0]);
/*
Get search results
Get all search result divs
*/
$results =
$this->fuckhtml
->getElementsByTagName("li");
foreach($results as $result){
foreach($results as $container){
$data = [
"title" => null,
"description" => null,
"url" => null,
"date" => null,
"type" => "web",
"thumb" => [
"url" => null,
"ratio" => null
],
"sublink" => [],
"table" => []
];
$this->fuckhtml->load($result);
$title =
$this->fuckhtml->load($container);
$results =
$this->fuckhtml
->getElementsByClassName("title", "a")[0];
->getElementsByTagName("li");
$data["title"] =
html_entity_decode(
$this->fuckhtml
->getTextContent(
$title["innerHTML"]
)
);
$data["url"] =
html_entity_decode(
$this->fuckhtml
->getTextContent(
$title["attributes"]["href"]
)
);
$description =
$this->fuckhtml
->getElementsByClassName(
"s", "p"
);
if(count($description) !== 0){
foreach($results as $result){
$data["description"] =
$this->titledots(
html_entity_decode(
$this->fuckhtml
->getTextContent(
$description[0]
)
$data = [
"title" => null,
"description" => null,
"url" => null,
"date" => null,
"type" => "web",
"thumb" => [
"url" => null,
"ratio" => null
],
"sublink" => [],
"table" => []
];
$this->fuckhtml->load($result);
$title =
$this->fuckhtml
->getElementsByClassName("title", "a")[0];
$data["title"] =
html_entity_decode(
$this->fuckhtml
->getTextContent(
$title["innerHTML"]
)
);
}
$data["date"] =
explode(
" - ",
$this->fuckhtml
->getTextContent(
$data["url"] =
html_entity_decode(
$this->fuckhtml
->getElementsByClassName("i", "p")[1]
)
);
$data["date"] =
strtotime(
$data["date"][count($data["date"]) - 1]
);
$out["web"][] = $data;
->getTextContent(
$title["attributes"]["href"]
)
);
$description =
$this->fuckhtml
->getElementsByClassName(
"s", "p"
);
if(count($description) !== 0){
$data["description"] =
$this->titledots(
html_entity_decode(
$this->fuckhtml
->getTextContent(
$description[0]
)
)
);
}
$data["date"] =
explode(
" - ",
$this->fuckhtml
->getTextContent(
$this->fuckhtml
->getElementsByClassName("i", "p")[1]
)
);
$data["date"] =
strtotime(
$data["date"][count($data["date"]) - 1]
);
$out["web"][] = $data;
}
}
/*
@@ -969,12 +977,13 @@ class mojeek{
if($a["innerHTML"] == "Next"){
$out["npt"] = $this->nextpage->store(
$out["npt"] = $this->backend->store(
$this->fuckhtml
->getTextContent(
$a["attributes"]["href"]
),
"web"
"web",
$proxy
);
}
}
@@ -1001,6 +1010,7 @@ class mojeek{
try{
$html =
$this->get(
$this->backend->get_ip(),
"https://www.mojeek.com/search",
[
"q" => $search,
@@ -1011,168 +1021,139 @@ class mojeek{
throw new Exception("Failed to get HTML");
}
/*
$handle = fopen("scraper/mojeek.html", "r");
$html = fread($handle, filesize("scraper/mojeek.html"));
fclose($handle);*/
/*
Get big, standard and smaller nodes
fclose($handle);
*/
foreach(
[
"results-extended",
"results-standard"
]
as $categoryname
){
$this->fuckhtml->load($html);
$articles =
$this->fuckhtml->getElementsByTagName("article");
foreach($articles as $article){
$this->fuckhtml->load($html);
$this->fuckhtml->load($article);
$categories =
$data = [
"title" => null,
"author" => null,
"description" => null,
"date" => null,
"thumb" =>
[
"url" => null,
"ratio" => null
],
"url" => null
];
$a = $this->fuckhtml->getElementsByTagName("a")[0];
$data["title"] =
$this->fuckhtml
->getElementsByClassName(
$categoryname,
"ul"
->getTextContent(
$a["attributes"]["title"]
);
foreach($categories as $category){
$this->fuckhtml->load($category);
$nodes =
$data["url"] =
$this->fuckhtml
->getTextContent(
$a["attributes"]["href"]
);
$p = $this->fuckhtml->getElementsByTagName("p");
$data["description"] =
$this->titledots(
$this->fuckhtml
->getElementsByTagName("li");
foreach($nodes as $node){
$data = [
"title" => null,
"author" => null,
"description" => null,
"date" => null,
"thumb" =>
[
"url" => null,
"ratio" => null
],
"url" => null
];
/*
Parse the results
*/
$this->fuckhtml->load($node);
// get title + url
$a =
$this->fuckhtml
->getElementsByTagName("a")[0];
$data["title"] =
$this->fuckhtml
->getTextContent(
$a["attributes"]["title"]
);
$data["url"] =
$this->fuckhtml
->getTextContent(
$a["attributes"]["href"]
);
// get image
$image =
$this->fuckhtml
->getElementsByTagName("img");
if(count($image) !== 0){
$data["thumb"] = [
"url" =>
urldecode(
str_replace(
"/image?img=",
"",
$this->fuckhtml
->getTextContent(
$image[0]["attributes"]["src"]
)
)
),
"ratio" => "16:9"
];
}
// get description
$description =
$this->fuckhtml
->getElementsByClassName("s", "p");
if(count($description) !== 0){
$data["description"] =
$this->titledots(
$this->fuckhtml
->getTextContent(
$description[0]
)
);
}
// get date + time
$date =
->getTextContent(
$this->fuckhtml
->getElementsByClassName(
"date",
"p"
);
$i =
$this->fuckhtml
->getElementsByClassName("i", "p");
if(count($date) !== 0){
// we're inside a big node
$data["date"] = strtotime($date[0]["innerHTML"]);
if(count($i) !== 0){
$this->fuckhtml->load($i[0]);
$a =
$this->fuckhtml
->getElementsByTagName("a");
if(count($a) !== 0){
$data["author"] =
$this->fuckhtml
->getTextContent($a[0]);
}
}
}else{
// we're inside a small node
if(count($i) !== 0){
$i =
explode(
" - ",
$this->fuckhtml
->getTextContent($i[0])
);
$data["date"] = strtotime(array_pop($i));
$data["author"] = implode(" - ", $i);
}
}
$out["news"][] = $data;
}
"s",
$p
)[0]
)
);
if($data["description"] == ""){
$data["description"] = null;
}
// get date from big node
$date =
$this->fuckhtml
->getElementsByClassName(
"date",
$p
);
if(count($date) !== 0){
$data["date"] =
strtotime(
$this->fuckhtml
->getTextContent(
$date[0]
)
);
}
// grep date + author
$s =
$this->fuckhtml
->getElementsByClassName(
"i",
$p
)[0];
$this->fuckhtml->load($s);
$a =
$this->fuckhtml
->getElementsByTagName("a");
if(count($a) !== 0){
// parse big node information
$data["author"] =
$this->fuckhtml
->getTextContent(
$a[0]["innerHTML"]
);
}else{
// parse smaller nodes
$replace =
$this->fuckhtml
->getElementsByTagName("time")[0];
$data["date"] =
strtotime(
$this->fuckhtml
->getTextContent(
$replace
)
);
$s["innerHTML"] =
str_replace(
$replace["outerHTML"],
"",
$s["innerHTML"]
);
$data["author"] =
preg_replace(
'/ &bull; $/',
"",
$s["innerHTML"]
);
}
$out["news"][] = $data;
}
return $out;

View File

@@ -6,6 +6,9 @@ class pinterest{
include "lib/nextpage.php";
$this->nextpage = new nextpage("pinterest");
include "lib/proxy_pool.php";
$this->proxy = new proxy_pool("pinterest");
}
public function getfilters($page){
@@ -44,6 +47,8 @@ class pinterest{
curl_setopt($curlproc, CURLOPT_SSL_VERIFYPEER, true);
curl_setopt($curlproc, CURLOPT_CONNECTTIMEOUT, 30);
curl_setopt($curlproc, CURLOPT_TIMEOUT, 30);
$this->proxy->assign_proxy($curlproc);
$data = curl_exec($curlproc);

View File

@@ -4,10 +4,8 @@ class sc{
public function __construct(){
include "lib/nextpage.php";
$this->nextpage = new nextpage("sc");
$this->client_id = "ArYppSEotE3YiXCO4Nsgid2LLqJutiww";
$this->user_id = "766585-580597-163310-929698";
include "lib/backend.php";
$this->backend = new backend("sc");
}
public function getfilters($page){
@@ -27,7 +25,7 @@ class sc{
];
}
private function get($url, $get = []){
private function get($proxy, $url, $get = []){
$curlproc = curl_init();
@@ -40,7 +38,7 @@ class sc{
curl_setopt($curlproc, CURLOPT_ENCODING, ""); // default encoding
curl_setopt($curlproc, CURLOPT_HTTPHEADER,
["User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/116.0",
["User-Agent: " . config::USER_AGENT,
"Accept: application/json, text/javascript, */*; q=0.01",
"Accept-Language: en-US,en;q=0.5",
"Accept-Encoding: gzip",
@@ -58,6 +56,8 @@ class sc{
curl_setopt($curlproc, CURLOPT_SSL_VERIFYPEER, true);
curl_setopt($curlproc, CURLOPT_CONNECTTIMEOUT, 30);
curl_setopt($curlproc, CURLOPT_TIMEOUT, 30);
$this->backend->assign_proxy($curlproc, $proxy);
$data = curl_exec($curlproc);
@@ -74,7 +74,7 @@ class sc{
if($get["npt"]){
$params = $this->nextpage->get($get["npt"], "music");
[$params, $proxy] = $this->backend->get($get["npt"], "music");
$params = json_decode($params, true);
$url = $params["url"];
@@ -101,7 +101,13 @@ class sc{
// https://api-v2.soundcloud.com/search/playlists_without_albums?q=freddie%20dredd&variant_ids=&facet=genre&user_id=630591-269800-703400-765403&client_id=iMxZgT5mfGstBj8GWJbYMvpzelS8ne0E&limit=20&offset=0&linked_partitioning=1&app_version=1693487844&app_locale=en
$search = $get["s"];
if(strlen($search) === 0){
throw new Exception("Search term is empty!");
}
$type = $get["type"];
$proxy = $this->backend->get_ip();
switch($type){
@@ -111,8 +117,8 @@ class sc{
"q" => $search,
"variant_ids" => "",
"facet" => "model",
"user_id" => $this->user_id,
"client_id" => $this->client_id,
"user_id" => config::SC_USER_ID,
"client_id" => config::SC_CLIENT_TOKEN,
"limit" => 20,
"offset" => 0,
"linked_partitioning" => 1,
@@ -127,8 +133,8 @@ class sc{
"q" => $search,
"variant_ids" => "",
"facet_genre" => "",
"user_id" => $this->user_id,
"client_id" => $this->client_id,
"user_id" => config::SC_USER_ID,
"client_id" => config::SC_CLIENT_TOKEN,
"limit" => 20,
"offset" => 0,
"linked_partitioning" => 1,
@@ -143,8 +149,8 @@ class sc{
"q" => $search,
"variant_ids" => "",
"facet" => "place",
"user_id" => $this->user_id,
"client_id" => $this->client_id,
"user_id" => config::SC_USER_ID,
"client_id" => config::SC_CLIENT_TOKEN,
"limit" => 20,
"offset" => 0,
"linked_partitioning" => 1,
@@ -159,8 +165,8 @@ class sc{
"q" => $search,
"variant_ids" => "",
"facet" => "genre",
"user_id" => $this->user_id,
"client_id" => $this->client_id,
"user_id" => config::SC_USER_ID,
"client_id" => config::SC_CLIENT_TOKEN,
"limit" => 20,
"offset" => 0,
"linked_partitioning" => 1,
@@ -175,8 +181,8 @@ class sc{
"q" => $search,
"variant_ids" => "",
"facet" => "genre",
"user_id" => $this->user_id,
"client_id" => $this->client_id,
"user_id" => config::SC_USER_ID,
"client_id" => config::SC_CLIENT_TOKEN,
"limit" => 20,
"offset" => 0,
"linked_partitioning" => 1,
@@ -192,8 +198,8 @@ class sc{
"variant_ids" => "",
"filter.content_tier" => "SUB_HIGH_TIER",
"facet" => "genre",
"user_id" => $this->user_id,
"client_id" => $this->client_id,
"user_id" => config::SC_USER_ID,
"client_id" => config::SC_CLIENT_TOKEN,
"limit" => 20,
"offset" => 0,
"linked_partitioning" => 1,
@@ -206,7 +212,7 @@ class sc{
try{
$json = $this->get($url, $params);
$json = $this->get($proxy, $url, $params);
}catch(Exception $error){
@@ -244,9 +250,10 @@ class sc{
$params["url"] = $url; // we will remove this later
$out["npt"] =
$this->nextpage->store(
$this->backend->store(
json_encode($params),
"music"
"music",
$proxy
);
}
@@ -342,7 +349,7 @@ class sc{
"endpoint" => "audio_sc",
"url" =>
$item["media"]["transcodings"][0]["url"] .
"?client_id=" . $this->client_id .
"?client_id=" . config::SC_CLIENT_TOKEN .
"&track_authorization=" .
$item["track_authorization"]
];

View File

@@ -4,8 +4,8 @@ class wiby{
public function __construct(){
include "lib/nextpage.php";
$this->nextpage = new nextpage("wiby");
include "lib/backend.php";
$this->backend = new backend("wiby");
}
public function getfilters($page){
@@ -36,7 +36,7 @@ class wiby{
];
}
private function get($url, $get = [], $nsfw){
private function get($proxy, $url, $get = [], $nsfw){
$curlproc = curl_init();
@@ -45,11 +45,13 @@ class wiby{
$url .= "?" . $get;
}
// Point the cURL handle at the final request URL. No debug dump here:
// print_r() writes directly to the output stream, so the proxy/URL pair
// would be emitted ahead of whatever the caller outputs for this request.
curl_setopt($curlproc, CURLOPT_URL, $url);
curl_setopt($curlproc, CURLOPT_ENCODING, ""); // default encoding
curl_setopt($curlproc, CURLOPT_HTTPHEADER,
["User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:107.0) Gecko/20100101 Firefox/110.0",
["User-Agent: " . config::USER_AGENT,
"Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
"Accept-Language: en-US,en;q=0.5",
"Accept-Encoding: gzip",
@@ -69,6 +71,8 @@ class wiby{
curl_setopt($curlproc, CURLOPT_CONNECTTIMEOUT, 30);
curl_setopt($curlproc, CURLOPT_TIMEOUT, 30);
$this->backend->assign_proxy($curlproc, $proxy);
$data = curl_exec($curlproc);
if(curl_errno($curlproc)){
@@ -84,11 +88,8 @@ class wiby{
if($get["npt"]){
$q =
json_decode(
$this->nextpage->get($get["npt"], "web"),
true
);
[$q, $proxy] = $this->backend->get($get["npt"], "web");
$q = json_decode($q, true);
$nsfw = $q["nsfw"];
unset($q["nsfw"]);
@@ -100,6 +101,7 @@ class wiby{
throw new Exception("Search term is empty!");
}
$proxy = $this->backend->get_ip();
$date = $get["date"];
$nsfw = $get["nsfw"] == "yes" ? "0" : "1";
@@ -150,6 +152,7 @@ class wiby{
try{
$html = $this->get(
$proxy,
"https://wiby.me/",
$q,
$nsfw
@@ -171,13 +174,14 @@ class wiby{
}else{
$nextpage =
$this->nextpage->store(
$this->backend->store(
json_encode([
"q" => $q["q"],
"p" => (int)$nextpage[1],
"nsfw" => $nsfw
]),
"web"
"web",
$proxy
);
}

View File

@@ -10,11 +10,11 @@ class yandex{
include "lib/fuckhtml.php";
$this->fuckhtml = new fuckhtml();
include "lib/nextpage.php";
$this->nextpage = new nextpage("yandex");
include "lib/backend.php";
// backend included in the scraper functions
}
private function get($url, $get = [], $nsfw){
private function get($proxy, $url, $get = [], $nsfw){
$curlproc = curl_init();
@@ -32,7 +32,7 @@ class yandex{
}
$headers =
["User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/113.0",
["User-Agent: " . config::USER_AGENT,
"Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
"Accept-Encoding: gzip",
"Accept-Language: en-US,en;q=0.5",
@@ -54,6 +54,8 @@ class yandex{
curl_setopt($curlproc, CURLOPT_SSL_VERIFYPEER, true);
curl_setopt($curlproc, CURLOPT_CONNECTTIMEOUT, 30);
curl_setopt($curlproc, CURLOPT_TIMEOUT, 30);
$this->backend->assign_proxy($curlproc, $proxy);
$data = curl_exec($curlproc);
@@ -207,6 +209,8 @@ class yandex{
public function web($get){
$this->backend = new backend("yandex_w");
// has captcha
// https://yandex.com/search/touch/?text=lol&app_platform=android&appsearch_header=1&ui=webmobileapp.yandex&app_version=23070603&app_id=ru.yandex.searchplugin&search_source=yandexcom_touch_native&clid=2218567
@@ -215,10 +219,11 @@ class yandex{
if($get["npt"]){
$npt = $this->nextpage->get($get["npt"], "web");
[$npt, $proxy] = $this->backend->get($get["npt"], "web");
$html =
$this->get(
$proxy,
"https://yandex.com" . $npt,
[],
"yes"
@@ -226,6 +231,12 @@ class yandex{
}else{
$search = $get["s"];
if(strlen($search) === 0){
throw new Exception("Search term is empty!");
}
$proxy = $this->backend->get_ip();
$lang = $get["lang"];
$older = $get["older"];
$newer = $get["newer"];
@@ -269,6 +280,7 @@ class yandex{
try{
$html =
$this->get(
$proxy,
"https://yandex.com/search/site/",
$params,
"yes"
@@ -313,7 +325,7 @@ class yandex{
if(count($npt) !== 0){
$out["npt"] =
$this->nextpage->store(
$this->backend->store(
$this->fuckhtml
->getTextContent(
$npt
@@ -321,7 +333,8 @@ class yandex{
["attributes"]
["href"]
),
"web"
"web",
$proxy
);
}
@@ -386,17 +399,18 @@ class yandex{
public function image($get){
$this->backend = new backend("yandex_i");
if($get["npt"]){
$request =
json_decode(
$this->nextpage->get(
$get["npt"],
"images"
),
true
[$request, $proxy] =
$this->backend->get(
$get["npt"],
"images"
);
$request = json_decode($request, true);
$nsfw = $request["nsfw"];
unset($request["nsfw"]);
}else{
@@ -407,6 +421,7 @@ class yandex{
throw new Exception("Search term is empty!");
}
$proxy = $this->backend->get_ip();
$nsfw = $get["nsfw"];
$time = $get["time"];
$size = $get["size"];
@@ -611,9 +626,11 @@ class yandex{
try{
// get() is declared as ($proxy, $url, $get, $nsfw). The backend used for
// proxy assignment is the one stored in $this->backend at the top of
// image(), so no scraper name is passed as an extra argument.
$json = $this->get(
$proxy,
"https://yandex.com/images/search",
$request,
$nsfw
);
}catch(Exception $err){
@@ -676,7 +693,12 @@ class yandex{
$request["p"] = 1;
}
$out["npt"] = $this->nextpage->store(json_encode($request), "images");
$out["npt"] =
$this->backend->store(
json_encode($request),
"images",
$proxy
);
}
// get search results
@@ -744,21 +766,29 @@ class yandex{
public function video($get){
$this->backend = new backend("yandex_v");
if($get["npt"]){
$params =
json_decode(
$this->nextpage->get(
$get["npt"],
"web"
),
true
[$params, $proxy] =
$this->backend->get(
$get["npt"],
"video"
);
$params = json_decode($params, true);
$nsfw = $params["nsfw"];
unset($params["nsfw"]);
}else{
$search = $get["s"];
if(strlen($search) === 0){
throw new Exception("Search term is empty!");
}
$proxy = $this->backend->get_ip();
$nsfw = $get["nsfw"];
$time = $get["time"];
$duration = $get["duration"];
@@ -865,9 +895,11 @@ class yandex{
try{
// get() is declared as ($proxy, $url, $get, $nsfw). The backend used for
// proxy assignment is the one stored in $this->backend at the top of
// video(), so no scraper name is passed as an extra argument.
$json =
$this->get(
$proxy,
"https://yandex.com/video/search",
$params,
$nsfw
);
}catch(Exception $error){
@@ -926,9 +958,10 @@ class yandex{
$params["p"] = "1";
$params["nsfw"] = $nsfw;
$out["npt"] =
$this->nextpage->store(
$this->backend->store(
json_encode($params),
"web"
"video",
$proxy
);
}

View File

@@ -4,8 +4,8 @@ class yep{
public function __construct(){
include "lib/nextpage.php";
$this->nextpage = new nextpage("yep");
include "lib/backend.php";
$this->backend = new backend("yep");
}
public function getfilters($page){
@@ -238,7 +238,7 @@ class yep{
];
}
private function get($url, $get = []){
private function get($proxy, $url, $get = []){
$curlproc = curl_init();
@@ -251,7 +251,7 @@ class yep{
curl_setopt($curlproc, CURLOPT_ENCODING, ""); // default encoding
curl_setopt($curlproc, CURLOPT_HTTPHEADER,
["User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:107.0) Gecko/20100101 Firefox/110.0",
["User-Agent: " . config::USER_AGENT,
"Accept: */*",
"Accept-Language: en-US,en;q=0.5",
"Accept-Encoding: gzip",
@@ -269,6 +269,8 @@ class yep{
curl_setopt($curlproc, CURLOPT_SSL_VERIFYPEER, true);
curl_setopt($curlproc, CURLOPT_CONNECTTIMEOUT, 30);
curl_setopt($curlproc, CURLOPT_TIMEOUT, 30);
$this->backend->assign_proxy($curlproc, $proxy);
$data = curl_exec($curlproc);
@@ -284,6 +286,11 @@ class yep{
public function image($get){
$search = $get["s"];
if(strlen($search) === 0){
throw new Exception("Search term is empty!");
}
$country = $get["country"];
$nsfw = $get["nsfw"];
@@ -305,6 +312,7 @@ class yep{
$json =
json_decode(
$this->get(
$this->backend->get_ip(), // no nextpage!
"https://api.yep.com/fs/2/search",
[
"client" => "web",

View File

@@ -8,8 +8,8 @@ class youtube{
public function __construct(){
include "lib/nextpage.php";
$this->nextpage = new nextpage("yt");
include "lib/backend.php";
$this->backend = new backend("yt");
}
public function getfilters($page){
@@ -340,7 +340,7 @@ class youtube{
const req_web = 0;
const req_xhr = 1;
private function get($url, $get = [], $reqtype = self::req_web, $continuation = null){
private function get($proxy, $url, $get = [], $reqtype = self::req_web, $continuation = null){
$curlproc = curl_init();
@@ -354,7 +354,7 @@ class youtube{
switch($reqtype){
case self::req_web:
$headers =
["User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:107.0) Gecko/20100101 Firefox/110.0",
["User-Agent: " . config::USER_AGENT,
"Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
"Accept-Language: en-US,en;q=0.5",
"Accept-Encoding: gzip",
@@ -370,7 +370,7 @@ class youtube{
case self::req_xhr:
$headers =
["User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:110.0) Gecko/20100101 Firefox/110.0",
["User-Agent: " . config::USER_AGENT,
"Accept: */*",
"Accept-Language: en-US,en;q=0.5",
"Accept-Encoding: gzip",
@@ -397,6 +397,8 @@ class youtube{
curl_setopt($curlproc, CURLOPT_SSL_VERIFYPEER, true);
curl_setopt($curlproc, CURLOPT_CONNECTTIMEOUT, 30);
curl_setopt($curlproc, CURLOPT_TIMEOUT, 30);
$this->backend->assign_proxy($curlproc, $proxy);
$data = curl_exec($curlproc);
@@ -430,17 +432,17 @@ class youtube{
$json = fread($handle, filesize("nextpage.json"));
fclose($handle);*/
$npt =
json_decode(
$this->nextpage->get(
$get["npt"],
"videos"
),
true
[$npt, $proxy] =
$this->backend->get(
$get["npt"],
"videos"
);
$npt = json_decode($npt, true);
try{
$json = $this->get(
$proxy,
"https://www.youtube.com/youtubei/v1/search",
[
"key" => $npt["key"],
@@ -507,6 +509,7 @@ class youtube{
throw new Exception("Search term is empty!");
}
$proxy = $this->backend->get_ip();
$date = $get["date"];
$type = $get["type"];
$duration = $get["duration"];
@@ -537,6 +540,7 @@ class youtube{
try{
$json = $this->get(
$proxy,
"https://www.youtube.com/results",
$get
);
@@ -942,7 +946,14 @@ class youtube{
if($this->out["npt"] !== null){
$this->out["npt"] = $this->nextpage->store(json_encode($this->out["npt"]), "videos");
$this->out["npt"] =
$this->backend->store(
json_encode(
$this->out["npt"]
),
"videos",
$proxy
);
}
return $this->out;