Compare commits

..

No commits in common. "master" and "master" have entirely different histories.

9 changed files with 42 additions and 1259 deletions

View File

@ -119,7 +119,7 @@ class config{
// Default user agent to use for scraper requests. Sometimes ignored to get specific webpages // Default user agent to use for scraper requests. Sometimes ignored to get specific webpages
// Changing this might break things. // Changing this might break things.
const USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:131.0) Gecko/20100101 Firefox/131.0"; const USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:129.0) Gecko/20100101 Firefox/129.0";
// Proxy pool assignments for each scraper // Proxy pool assignments for each scraper
// false = Use server's raw IP // false = Use server's raw IP
@ -129,7 +129,6 @@ class config{
const PROXY_BRAVE = false; const PROXY_BRAVE = false;
const PROXY_FB = false; // facebook const PROXY_FB = false; // facebook
const PROXY_GOOGLE = false; const PROXY_GOOGLE = false;
const PROXY_GOOGLE_CSE = false;
const PROXY_STARTPAGE = false; const PROXY_STARTPAGE = false;
const PROXY_QWANT = false; const PROXY_QWANT = false;
const PROXY_GHOSTERY = false; const PROXY_GHOSTERY = false;
@ -158,9 +157,6 @@ class config{
// Scraper-specific parameters // Scraper-specific parameters
// //
// GOOGLE CSE
const GOOGLE_CX_ENDPOINT = "d4e68b99b876541f0";
// MARGINALIA // MARGINALIA
// Use "null" to default out to HTML scraping OR specify a string to // Use "null" to default out to HTML scraping OR specify a string to
// use the API (Eg: "public"). API has less filters. // use the API (Eg: "public"). API has less filters.

View File

@ -939,7 +939,6 @@ class frontend{
"brave" => "Brave", "brave" => "Brave",
"yandex" => "Yandex", "yandex" => "Yandex",
"google" => "Google", "google" => "Google",
"google_cse" => "Google CSE",
"startpage" => "Startpage", "startpage" => "Startpage",
"qwant" => "Qwant", "qwant" => "Qwant",
"ghostery" => "Ghostery", "ghostery" => "Ghostery",
@ -964,7 +963,6 @@ class frontend{
"yandex" => "Yandex", "yandex" => "Yandex",
"brave" => "Brave", "brave" => "Brave",
"google" => "Google", "google" => "Google",
"google_cse" => "Google CSE",
"startpage" => "Startpage", "startpage" => "Startpage",
"qwant" => "Qwant", "qwant" => "Qwant",
"yep" => "Yep", "yep" => "Yep",

View File

@ -381,8 +381,6 @@ class fuckhtml{
$json_out = null; $json_out = null;
$last_char = null; $last_char = null;
$keyword_check = null;
for($i=0; $i<strlen($json); $i++){ for($i=0; $i<strlen($json); $i++){
switch($json[$i]){ switch($json[$i]){
@ -398,7 +396,6 @@ class fuckhtml{
$bracket = false; $bracket = false;
$is_close_bracket = true; $is_close_bracket = true;
}else{ }else{
if($bracket === false){ if($bracket === false){
@ -432,31 +429,6 @@ class fuckhtml{
$is_close_bracket === false $is_close_bracket === false
){ ){
// do keyword check
$keyword_check .= $json[$i];
if(in_array($json[$i], [":", "{"])){
$keyword_check = substr($keyword_check, 0, -1);
if(
preg_match(
'/function|array|return/i',
$keyword_check
)
){
$json_out =
preg_replace(
'/[{"]*' . preg_quote($keyword_check, "/") . '$/',
"",
$json_out
);
}
$keyword_check = null;
}
// here we know we're not iterating over a quoted string // here we know we're not iterating over a quoted string
switch($json[$i]){ switch($json[$i]){

View File

@ -293,8 +293,8 @@ class brave{
/* /*
$handle = fopen("scraper/brave.html", "r"); $handle = fopen("scraper/brave.html", "r");
$html = fread($handle, filesize("scraper/brave.html")); $html = fread($handle, filesize("scraper/brave.html"));
fclose($handle);*/ fclose($handle);
*/
try{ try{
$html = $html =
@ -410,20 +410,10 @@ class brave{
throw new Exception("Could not grep JavaScript object"); throw new Exception("Could not grep JavaScript object");
} }
$data =
rtrim(
preg_replace(
'/\(Array\(0\)\)\).*$/',
"",
$grep[1]
),
" ]"
) . "]";
$data = $data =
$this->fuckhtml $this->fuckhtml
->parseJsObject( ->parseJsObject(
$data $grep[1]
); );
unset($grep); unset($grep);
@ -673,10 +663,7 @@ class brave{
$table["Address"] = $result["location"]["postal_address"]["displayAddress"]; $table["Address"] = $result["location"]["postal_address"]["displayAddress"];
} }
if( if(isset($result["location"]["rating"])){
isset($result["location"]["rating"]) &&
$result["location"]["rating"] != "void 0"
){
$table["Rating"] = $table["Rating"] =
$result["location"]["rating"]["ratingValue"] . "/" . $result["location"]["rating"]["ratingValue"] . "/" .
@ -684,19 +671,13 @@ class brave{
number_format($result["location"]["rating"]["reviewCount"]) . " votes)"; number_format($result["location"]["rating"]["reviewCount"]) . " votes)";
} }
if( if(isset($result["location"]["contact"]["telephone"])){
isset($result["location"]["contact"]["telephone"]) &&
$result["location"]["contact"]["telephone"] != "void 0"
){
$table["Phone number"] = $table["Phone number"] =
$result["location"]["contact"]["telephone"]; $result["location"]["contact"]["telephone"];
} }
if( if(isset($result["location"]["price_range"])){
isset($result["location"]["price_range"]) &&
$result["location"]["price_range"] != "void 0"
){
$table["Price"] = $table["Price"] =
$result["location"]["price_range"]; $result["location"]["price_range"];

View File

@ -28,9 +28,6 @@ class ddg{
curl_setopt($curlproc, CURLOPT_URL, $url); curl_setopt($curlproc, CURLOPT_URL, $url);
// http2 bypass
curl_setopt($curlproc, CURLOPT_HTTP_VERSION, CURL_HTTP_VERSION_2_0);
switch($reqtype){ switch($reqtype){
case self::req_web: case self::req_web:
$headers = $headers =
@ -39,33 +36,27 @@ class ddg{
"Accept-Encoding: gzip", "Accept-Encoding: gzip",
"Accept-Language: en-US,en;q=0.5", "Accept-Language: en-US,en;q=0.5",
"DNT: 1", "DNT: 1",
"Sec-GPC: 1",
"Connection: keep-alive", "Connection: keep-alive",
"Upgrade-Insecure-Requests: 1", "Upgrade-Insecure-Requests: 1",
"Sec-Fetch-Dest: document", "Sec-Fetch-Dest: document",
"Sec-Fetch-Mode: navigate", "Sec-Fetch-Mode: navigate",
"Sec-Fetch-Site: same-origin", "Sec-Fetch-Site: cross-site",
"Sec-Fetch-User: ?1", "Upgrade-Insecure-Requests: 1"];
"Priority: u=0, i",
"TE: trailers"];
break; break;
case self::req_xhr: case self::req_xhr:
$headers = $headers =
["User-Agent: " . config::USER_AGENT, ["User-Agent: " . config::USER_AGENT,
"Accept: application/json, text/javascript, */*; q=0.01", "Accept: */*",
"Accept-Encoding: gzip", "Accept-Encoding: gzip",
"Accept-Language: en-US,en;q=0.5", "Accept-Language: en-US,en;q=0.5",
"Connection: keep-alive", "Connection: keep-alive",
"Referer: https://duckduckgo.com/", "Referer: https://duckduckgo.com/",
"X-Requested-With: XMLHttpRequest", "X-Requested-With: XMLHttpRequest",
"DNT: 1", "DNT: 1",
"Sec-GPC: 1", "Sec-Fetch-Dest: script",
"Connection: keep-alive", "Sec-Fetch-Mode: no-cors",
"Sec-Fetch-Dest: empty", "Sec-Fetch-Site: same-site"];
"Sec-Fetch-Mode: cors",
"Sec-Fetch-Site: same-origin",
"TE: trailers"];
break; break;
} }
@ -1898,12 +1889,12 @@ class ddg{
[$npt, $proxy] = $this->backend->get($get["npt"], "images"); [$npt, $proxy] = $this->backend->get($get["npt"], "images");
try{ try{
$json = $this->get( $json = json_decode($this->get(
$proxy, $proxy,
"https://duckduckgo.com/i.js?" . $npt, "https://duckduckgo.com/i.js?" . $npt,
[], [],
ddg::req_xhr ddg::req_xhr
); ), true);
}catch(Exception $err){ }catch(Exception $err){
@ -1929,7 +1920,6 @@ class ddg{
$filter = []; $filter = [];
$get_filters = [ $get_filters = [
"hps" => "1",
"q" => $search, "q" => $search,
"iax" => "images", "iax" => "images",
"ia" => "images" "ia" => "images"
@ -2004,12 +1994,12 @@ class ddg{
} }
try{ try{
$json = $this->get( $json = json_decode($this->get(
$proxy, $proxy,
"https://duckduckgo.com/i.js", "https://duckduckgo.com/i.js",
$js_params, $js_params,
ddg::req_xhr ddg::req_xhr
); ), true);
}catch(Exception $err){ }catch(Exception $err){
@ -2017,13 +2007,6 @@ class ddg{
} }
} }
$json = json_decode($json, true);
if($json === null){
throw new Exception("Failed to decode JSON");
}
$out = [ $out = [
"status" => "ok", "status" => "ok",
"npt" => null, "npt" => null,

File diff suppressed because it is too large Load Diff

View File

@ -220,7 +220,6 @@ class marginalia{
"related" => [] "related" => []
]; ];
// API scraper
if(config::MARGINALIA_API_KEY !== null){ if(config::MARGINALIA_API_KEY !== null){
try{ try{
@ -264,57 +263,34 @@ class marginalia{
return $out; return $out;
} }
// HTML parser // no more cloudflare!! Parse html by default
$proxy = $this->backend->get_ip(); $params = [
"query" => $search
];
if($get["npt"]){ foreach(["adtech", "recent", "intitle"] as $v){
[$params, $proxy] = if($get[$v] == "yes"){
$this->backend->get(
$get["npt"],
"web"
);
try{ switch($v){
$html =
$this->get(
$proxy,
"https://search.marginalia.nu/search?" . $params
);
}catch(Exception $error){
throw new Exception("Failed to get HTML"); case "adtech": $params["adtech"] = "reduce"; break;
} case "recent": $params["recent"] = "recent"; break;
case "adtech": $params["searchTitle"] = "title"; break;
}else{
$params = [
"query" => $search
];
foreach(["adtech", "recent", "intitle"] as $v){
if($get[$v] == "yes"){
switch($v){
case "adtech": $params["adtech"] = "reduce"; break;
case "recent": $params["recent"] = "recent"; break;
case "adtech": $params["searchTitle"] = "title"; break;
}
} }
} }
}
try{ try{
$html = $html =
$this->get( $this->get(
$proxy, $this->backend->get_ip(),
"https://search.marginalia.nu/search", "https://search.marginalia.nu/search",
$params $params
); );
}catch(Exception $error){ }catch(Exception $error){
throw new Exception("Failed to get HTML"); throw new Exception("Failed to get HTML");
}
} }
$this->fuckhtml->load($html); $this->fuckhtml->load($html);
@ -411,65 +387,6 @@ class marginalia{
]; ];
} }
// get next page
$this->fuckhtml->load($html);
$pagination =
$this->fuckhtml
->getElementsByAttributeValue(
"aria-label",
"pagination",
"nav"
);
if(count($pagination) === 0){
// no pagination
return $out;
}
$this->fuckhtml->load($pagination[0]);
$pages =
$this->fuckhtml
->getElementsByClassName(
"page-link",
"a"
);
$found_current_page = false;
foreach($pages as $page){
if(
stripos(
$page["attributes"]["class"],
"active"
) !== false
){
$found_current_page = true;
continue;
}
if($found_current_page){
// we found current page index, and we iterated over
// the next page <a>
$out["npt"] =
$this->backend->store(
parse_url(
$page["attributes"]["href"],
PHP_URL_QUERY
),
"web",
$proxy
);
break;
}
}
return $out; return $out;
} }
} }

View File

@ -701,11 +701,9 @@ class mojeek{
if(count($thumb) === 2){ if(count($thumb) === 2){
$answer["thumb"] = $answer["thumb"] =
urldecode( $this->fuckhtml
$this->fuckhtml ->getTextContent(
->getTextContent( $thumb[1]
$thumb[1]
)
); );
} }
} }

View File

@ -133,10 +133,6 @@ $settings = [
"value" => "google", "value" => "google",
"text" => "Google" "text" => "Google"
], ],
[
"value" => "google_cse",
"text" => "Google CSE"
],
[ [
"value" => "startpage", "value" => "startpage",
"text" => "Startpage" "text" => "Startpage"
@ -207,10 +203,6 @@ $settings = [
"value" => "google", "value" => "google",
"text" => "Google" "text" => "Google"
], ],
[
"value" => "google_cse",
"text" => "Google CSE"
],
[ [
"value" => "startpage", "value" => "startpage",
"text" => "Startpage" "text" => "Startpage"