forked from lolcat/4get
Compare commits
11 Commits
Author | SHA1 | Date |
---|---|---|
lolcat | 920b9d5b3f | |
lolcat | 9cd369ac08 | |
lolcat | e83865be49 | |
lolcat | 68dd7f29f6 | |
lolcat | aaa30c79f5 | |
lolcat | 070f9d442b | |
lolcat | 9c18753ec3 | |
lolcat | d8a729796e | |
lolcat | 2bbe5a29a9 | |
lolcat | 9ac195ac3b | |
lolcat | d427a48ed4 |
|
@ -119,7 +119,7 @@ class config{
|
||||||
|
|
||||||
// Default user agent to use for scraper requests. Sometimes ignored to get specific webpages
|
// Default user agent to use for scraper requests. Sometimes ignored to get specific webpages
|
||||||
// Changing this might break things.
|
// Changing this might break things.
|
||||||
const USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:129.0) Gecko/20100101 Firefox/129.0";
|
const USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:131.0) Gecko/20100101 Firefox/131.0";
|
||||||
|
|
||||||
// Proxy pool assignments for each scraper
|
// Proxy pool assignments for each scraper
|
||||||
// false = Use server's raw IP
|
// false = Use server's raw IP
|
||||||
|
@ -129,6 +129,7 @@ class config{
|
||||||
const PROXY_BRAVE = false;
|
const PROXY_BRAVE = false;
|
||||||
const PROXY_FB = false; // facebook
|
const PROXY_FB = false; // facebook
|
||||||
const PROXY_GOOGLE = false;
|
const PROXY_GOOGLE = false;
|
||||||
|
const PROXY_GOOGLE_CSE = false;
|
||||||
const PROXY_STARTPAGE = false;
|
const PROXY_STARTPAGE = false;
|
||||||
const PROXY_QWANT = false;
|
const PROXY_QWANT = false;
|
||||||
const PROXY_GHOSTERY = false;
|
const PROXY_GHOSTERY = false;
|
||||||
|
@ -157,6 +158,9 @@ class config{
|
||||||
// Scraper-specific parameters
|
// Scraper-specific parameters
|
||||||
//
|
//
|
||||||
|
|
||||||
|
// GOOGLE CSE
|
||||||
|
const GOOGLE_CX_ENDPOINT = "d4e68b99b876541f0";
|
||||||
|
|
||||||
// MARGINALIA
|
// MARGINALIA
|
||||||
// Use "null" to default out to HTML scraping OR specify a string to
|
// Use "null" to default out to HTML scraping OR specify a string to
|
||||||
// use the API (e.g. "public"). The API has fewer filters.
|
// use the API (e.g. "public"). The API has fewer filters.
|
||||||
|
|
|
@ -939,6 +939,7 @@ class frontend{
|
||||||
"brave" => "Brave",
|
"brave" => "Brave",
|
||||||
"yandex" => "Yandex",
|
"yandex" => "Yandex",
|
||||||
"google" => "Google",
|
"google" => "Google",
|
||||||
|
"google_cse" => "Google CSE",
|
||||||
"startpage" => "Startpage",
|
"startpage" => "Startpage",
|
||||||
"qwant" => "Qwant",
|
"qwant" => "Qwant",
|
||||||
"ghostery" => "Ghostery",
|
"ghostery" => "Ghostery",
|
||||||
|
@ -963,6 +964,7 @@ class frontend{
|
||||||
"yandex" => "Yandex",
|
"yandex" => "Yandex",
|
||||||
"brave" => "Brave",
|
"brave" => "Brave",
|
||||||
"google" => "Google",
|
"google" => "Google",
|
||||||
|
"google_cse" => "Google CSE",
|
||||||
"startpage" => "Startpage",
|
"startpage" => "Startpage",
|
||||||
"qwant" => "Qwant",
|
"qwant" => "Qwant",
|
||||||
"yep" => "Yep",
|
"yep" => "Yep",
|
||||||
|
|
|
@ -381,6 +381,8 @@ class fuckhtml{
|
||||||
$json_out = null;
|
$json_out = null;
|
||||||
$last_char = null;
|
$last_char = null;
|
||||||
|
|
||||||
|
$keyword_check = null;
|
||||||
|
|
||||||
for($i=0; $i<strlen($json); $i++){
|
for($i=0; $i<strlen($json); $i++){
|
||||||
|
|
||||||
switch($json[$i]){
|
switch($json[$i]){
|
||||||
|
@ -396,6 +398,7 @@ class fuckhtml{
|
||||||
|
|
||||||
$bracket = false;
|
$bracket = false;
|
||||||
$is_close_bracket = true;
|
$is_close_bracket = true;
|
||||||
|
|
||||||
}else{
|
}else{
|
||||||
|
|
||||||
if($bracket === false){
|
if($bracket === false){
|
||||||
|
@ -429,6 +432,31 @@ class fuckhtml{
|
||||||
$is_close_bracket === false
|
$is_close_bracket === false
|
||||||
){
|
){
|
||||||
|
|
||||||
|
// do keyword check
|
||||||
|
$keyword_check .= $json[$i];
|
||||||
|
|
||||||
|
if(in_array($json[$i], [":", "{"])){
|
||||||
|
|
||||||
|
$keyword_check = substr($keyword_check, 0, -1);
|
||||||
|
|
||||||
|
if(
|
||||||
|
preg_match(
|
||||||
|
'/function|array|return/i',
|
||||||
|
$keyword_check
|
||||||
|
)
|
||||||
|
){
|
||||||
|
|
||||||
|
$json_out =
|
||||||
|
preg_replace(
|
||||||
|
'/[{"]*' . preg_quote($keyword_check, "/") . '$/',
|
||||||
|
"",
|
||||||
|
$json_out
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
$keyword_check = null;
|
||||||
|
}
|
||||||
|
|
||||||
// here we know we're not iterating over a quoted string
|
// here we know we're not iterating over a quoted string
|
||||||
switch($json[$i]){
|
switch($json[$i]){
|
||||||
|
|
||||||
|
|
|
@ -293,8 +293,8 @@ class brave{
|
||||||
/*
|
/*
|
||||||
$handle = fopen("scraper/brave.html", "r");
|
$handle = fopen("scraper/brave.html", "r");
|
||||||
$html = fread($handle, filesize("scraper/brave.html"));
|
$html = fread($handle, filesize("scraper/brave.html"));
|
||||||
fclose($handle);
|
fclose($handle);*/
|
||||||
*/
|
|
||||||
|
|
||||||
try{
|
try{
|
||||||
$html =
|
$html =
|
||||||
|
@ -410,10 +410,20 @@ class brave{
|
||||||
throw new Exception("Could not grep JavaScript object");
|
throw new Exception("Could not grep JavaScript object");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
$data =
|
||||||
|
rtrim(
|
||||||
|
preg_replace(
|
||||||
|
'/\(Array\(0\)\)\).*$/',
|
||||||
|
"",
|
||||||
|
$grep[1]
|
||||||
|
),
|
||||||
|
" ]"
|
||||||
|
) . "]";
|
||||||
|
|
||||||
$data =
|
$data =
|
||||||
$this->fuckhtml
|
$this->fuckhtml
|
||||||
->parseJsObject(
|
->parseJsObject(
|
||||||
$grep[1]
|
$data
|
||||||
);
|
);
|
||||||
unset($grep);
|
unset($grep);
|
||||||
|
|
||||||
|
@ -663,7 +673,10 @@ class brave{
|
||||||
$table["Address"] = $result["location"]["postal_address"]["displayAddress"];
|
$table["Address"] = $result["location"]["postal_address"]["displayAddress"];
|
||||||
}
|
}
|
||||||
|
|
||||||
if(isset($result["location"]["rating"])){
|
if(
|
||||||
|
isset($result["location"]["rating"]) &&
|
||||||
|
$result["location"]["rating"] != "void 0"
|
||||||
|
){
|
||||||
|
|
||||||
$table["Rating"] =
|
$table["Rating"] =
|
||||||
$result["location"]["rating"]["ratingValue"] . "/" .
|
$result["location"]["rating"]["ratingValue"] . "/" .
|
||||||
|
@ -671,13 +684,19 @@ class brave{
|
||||||
number_format($result["location"]["rating"]["reviewCount"]) . " votes)";
|
number_format($result["location"]["rating"]["reviewCount"]) . " votes)";
|
||||||
}
|
}
|
||||||
|
|
||||||
if(isset($result["location"]["contact"]["telephone"])){
|
if(
|
||||||
|
isset($result["location"]["contact"]["telephone"]) &&
|
||||||
|
$result["location"]["contact"]["telephone"] != "void 0"
|
||||||
|
){
|
||||||
|
|
||||||
$table["Phone number"] =
|
$table["Phone number"] =
|
||||||
$result["location"]["contact"]["telephone"];
|
$result["location"]["contact"]["telephone"];
|
||||||
}
|
}
|
||||||
|
|
||||||
if(isset($result["location"]["price_range"])){
|
if(
|
||||||
|
isset($result["location"]["price_range"]) &&
|
||||||
|
$result["location"]["price_range"] != "void 0"
|
||||||
|
){
|
||||||
|
|
||||||
$table["Price"] =
|
$table["Price"] =
|
||||||
$result["location"]["price_range"];
|
$result["location"]["price_range"];
|
||||||
|
|
|
@ -28,6 +28,9 @@ class ddg{
|
||||||
|
|
||||||
curl_setopt($curlproc, CURLOPT_URL, $url);
|
curl_setopt($curlproc, CURLOPT_URL, $url);
|
||||||
|
|
||||||
|
// http2 bypass
|
||||||
|
curl_setopt($curlproc, CURLOPT_HTTP_VERSION, CURL_HTTP_VERSION_2_0);
|
||||||
|
|
||||||
switch($reqtype){
|
switch($reqtype){
|
||||||
case self::req_web:
|
case self::req_web:
|
||||||
$headers =
|
$headers =
|
||||||
|
@ -36,27 +39,33 @@ class ddg{
|
||||||
"Accept-Encoding: gzip",
|
"Accept-Encoding: gzip",
|
||||||
"Accept-Language: en-US,en;q=0.5",
|
"Accept-Language: en-US,en;q=0.5",
|
||||||
"DNT: 1",
|
"DNT: 1",
|
||||||
|
"Sec-GPC: 1",
|
||||||
"Connection: keep-alive",
|
"Connection: keep-alive",
|
||||||
"Upgrade-Insecure-Requests: 1",
|
"Upgrade-Insecure-Requests: 1",
|
||||||
"Sec-Fetch-Dest: document",
|
"Sec-Fetch-Dest: document",
|
||||||
"Sec-Fetch-Mode: navigate",
|
"Sec-Fetch-Mode: navigate",
|
||||||
"Sec-Fetch-Site: cross-site",
|
"Sec-Fetch-Site: same-origin",
|
||||||
"Upgrade-Insecure-Requests: 1"];
|
"Sec-Fetch-User: ?1",
|
||||||
|
"Priority: u=0, i",
|
||||||
|
"TE: trailers"];
|
||||||
break;
|
break;
|
||||||
|
|
||||||
case self::req_xhr:
|
case self::req_xhr:
|
||||||
$headers =
|
$headers =
|
||||||
["User-Agent: " . config::USER_AGENT,
|
["User-Agent: " . config::USER_AGENT,
|
||||||
"Accept: */*",
|
"Accept: application/json, text/javascript, */*; q=0.01",
|
||||||
"Accept-Encoding: gzip",
|
"Accept-Encoding: gzip",
|
||||||
"Accept-Language: en-US,en;q=0.5",
|
"Accept-Language: en-US,en;q=0.5",
|
||||||
"Connection: keep-alive",
|
"Connection: keep-alive",
|
||||||
"Referer: https://duckduckgo.com/",
|
"Referer: https://duckduckgo.com/",
|
||||||
"X-Requested-With: XMLHttpRequest",
|
"X-Requested-With: XMLHttpRequest",
|
||||||
"DNT: 1",
|
"DNT: 1",
|
||||||
"Sec-Fetch-Dest: script",
|
"Sec-GPC: 1",
|
||||||
"Sec-Fetch-Mode: no-cors",
|
"Connection: keep-alive",
|
||||||
"Sec-Fetch-Site: same-site"];
|
"Sec-Fetch-Dest: empty",
|
||||||
|
"Sec-Fetch-Mode: cors",
|
||||||
|
"Sec-Fetch-Site: same-origin",
|
||||||
|
"TE: trailers"];
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1889,12 +1898,12 @@ class ddg{
|
||||||
[$npt, $proxy] = $this->backend->get($get["npt"], "images");
|
[$npt, $proxy] = $this->backend->get($get["npt"], "images");
|
||||||
|
|
||||||
try{
|
try{
|
||||||
$json = json_decode($this->get(
|
$json = $this->get(
|
||||||
$proxy,
|
$proxy,
|
||||||
"https://duckduckgo.com/i.js?" . $npt,
|
"https://duckduckgo.com/i.js?" . $npt,
|
||||||
[],
|
[],
|
||||||
ddg::req_xhr
|
ddg::req_xhr
|
||||||
), true);
|
);
|
||||||
|
|
||||||
}catch(Exception $err){
|
}catch(Exception $err){
|
||||||
|
|
||||||
|
@ -1920,6 +1929,7 @@ class ddg{
|
||||||
|
|
||||||
$filter = [];
|
$filter = [];
|
||||||
$get_filters = [
|
$get_filters = [
|
||||||
|
"hps" => "1",
|
||||||
"q" => $search,
|
"q" => $search,
|
||||||
"iax" => "images",
|
"iax" => "images",
|
||||||
"ia" => "images"
|
"ia" => "images"
|
||||||
|
@ -1994,12 +2004,12 @@ class ddg{
|
||||||
}
|
}
|
||||||
|
|
||||||
try{
|
try{
|
||||||
$json = json_decode($this->get(
|
$json = $this->get(
|
||||||
$proxy,
|
$proxy,
|
||||||
"https://duckduckgo.com/i.js",
|
"https://duckduckgo.com/i.js",
|
||||||
$js_params,
|
$js_params,
|
||||||
ddg::req_xhr
|
ddg::req_xhr
|
||||||
), true);
|
);
|
||||||
|
|
||||||
}catch(Exception $err){
|
}catch(Exception $err){
|
||||||
|
|
||||||
|
@ -2007,6 +2017,13 @@ class ddg{
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
$json = json_decode($json, true);
|
||||||
|
|
||||||
|
if($json === null){
|
||||||
|
|
||||||
|
throw new Exception("Failed to decode JSON");
|
||||||
|
}
|
||||||
|
|
||||||
$out = [
|
$out = [
|
||||||
"status" => "ok",
|
"status" => "ok",
|
||||||
"npt" => null,
|
"npt" => null,
|
||||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -220,6 +220,7 @@ class marginalia{
|
||||||
"related" => []
|
"related" => []
|
||||||
];
|
];
|
||||||
|
|
||||||
|
// API scraper
|
||||||
if(config::MARGINALIA_API_KEY !== null){
|
if(config::MARGINALIA_API_KEY !== null){
|
||||||
|
|
||||||
try{
|
try{
|
||||||
|
@ -263,34 +264,57 @@ class marginalia{
|
||||||
return $out;
|
return $out;
|
||||||
}
|
}
|
||||||
|
|
||||||
// no more cloudflare!! Parse html by default
|
// HTML parser
|
||||||
$params = [
|
$proxy = $this->backend->get_ip();
|
||||||
"query" => $search
|
|
||||||
];
|
|
||||||
|
|
||||||
foreach(["adtech", "recent", "intitle"] as $v){
|
if($get["npt"]){
|
||||||
|
|
||||||
if($get[$v] == "yes"){
|
[$params, $proxy] =
|
||||||
|
$this->backend->get(
|
||||||
|
$get["npt"],
|
||||||
|
"web"
|
||||||
|
);
|
||||||
|
|
||||||
switch($v){
|
try{
|
||||||
|
$html =
|
||||||
|
$this->get(
|
||||||
|
$proxy,
|
||||||
|
"https://search.marginalia.nu/search?" . $params
|
||||||
|
);
|
||||||
|
}catch(Exception $error){
|
||||||
|
|
||||||
case "adtech": $params["adtech"] = "reduce"; break;
|
throw new Exception("Failed to get HTML");
|
||||||
case "recent": $params["recent"] = "recent"; break;
|
}
|
||||||
case "adtech": $params["searchTitle"] = "title"; break;
|
|
||||||
|
}else{
|
||||||
|
$params = [
|
||||||
|
"query" => $search
|
||||||
|
];
|
||||||
|
|
||||||
|
foreach(["adtech", "recent", "intitle"] as $v){
|
||||||
|
|
||||||
|
if($get[$v] == "yes"){
|
||||||
|
|
||||||
|
switch($v){
|
||||||
|
|
||||||
|
case "adtech": $params["adtech"] = "reduce"; break;
|
||||||
|
case "recent": $params["recent"] = "recent"; break;
|
||||||
|
case "adtech": $params["searchTitle"] = "title"; break;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
|
||||||
|
|
||||||
try{
|
try{
|
||||||
$html =
|
$html =
|
||||||
$this->get(
|
$this->get(
|
||||||
$this->backend->get_ip(),
|
$proxy,
|
||||||
"https://search.marginalia.nu/search",
|
"https://search.marginalia.nu/search",
|
||||||
$params
|
$params
|
||||||
);
|
);
|
||||||
}catch(Exception $error){
|
}catch(Exception $error){
|
||||||
|
|
||||||
throw new Exception("Failed to get HTML");
|
throw new Exception("Failed to get HTML");
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
$this->fuckhtml->load($html);
|
$this->fuckhtml->load($html);
|
||||||
|
@ -387,6 +411,65 @@ class marginalia{
|
||||||
];
|
];
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// get next page
|
||||||
|
$this->fuckhtml->load($html);
|
||||||
|
|
||||||
|
$pagination =
|
||||||
|
$this->fuckhtml
|
||||||
|
->getElementsByAttributeValue(
|
||||||
|
"aria-label",
|
||||||
|
"pagination",
|
||||||
|
"nav"
|
||||||
|
);
|
||||||
|
|
||||||
|
if(count($pagination) === 0){
|
||||||
|
|
||||||
|
// no pagination
|
||||||
|
return $out;
|
||||||
|
}
|
||||||
|
|
||||||
|
$this->fuckhtml->load($pagination[0]);
|
||||||
|
|
||||||
|
$pages =
|
||||||
|
$this->fuckhtml
|
||||||
|
->getElementsByClassName(
|
||||||
|
"page-link",
|
||||||
|
"a"
|
||||||
|
);
|
||||||
|
|
||||||
|
$found_current_page = false;
|
||||||
|
|
||||||
|
foreach($pages as $page){
|
||||||
|
|
||||||
|
if(
|
||||||
|
stripos(
|
||||||
|
$page["attributes"]["class"],
|
||||||
|
"active"
|
||||||
|
) !== false
|
||||||
|
){
|
||||||
|
|
||||||
|
$found_current_page = true;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
if($found_current_page){
|
||||||
|
|
||||||
|
// the active (current) page link was seen on an earlier
|
||||||
|
// iteration, so this <a> is the link to the next page
|
||||||
|
|
||||||
|
$out["npt"] =
|
||||||
|
$this->backend->store(
|
||||||
|
parse_url(
|
||||||
|
$page["attributes"]["href"],
|
||||||
|
PHP_URL_QUERY
|
||||||
|
),
|
||||||
|
"web",
|
||||||
|
$proxy
|
||||||
|
);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
return $out;
|
return $out;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -701,9 +701,11 @@ class mojeek{
|
||||||
if(count($thumb) === 2){
|
if(count($thumb) === 2){
|
||||||
|
|
||||||
$answer["thumb"] =
|
$answer["thumb"] =
|
||||||
$this->fuckhtml
|
urldecode(
|
||||||
->getTextContent(
|
$this->fuckhtml
|
||||||
$thumb[1]
|
->getTextContent(
|
||||||
|
$thumb[1]
|
||||||
|
)
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -133,6 +133,10 @@ $settings = [
|
||||||
"value" => "google",
|
"value" => "google",
|
||||||
"text" => "Google"
|
"text" => "Google"
|
||||||
],
|
],
|
||||||
|
[
|
||||||
|
"value" => "google_cse",
|
||||||
|
"text" => "Google CSE"
|
||||||
|
],
|
||||||
[
|
[
|
||||||
"value" => "startpage",
|
"value" => "startpage",
|
||||||
"text" => "Startpage"
|
"text" => "Startpage"
|
||||||
|
@ -203,6 +207,10 @@ $settings = [
|
||||||
"value" => "google",
|
"value" => "google",
|
||||||
"text" => "Google"
|
"text" => "Google"
|
||||||
],
|
],
|
||||||
|
[
|
||||||
|
"value" => "google_cse",
|
||||||
|
"text" => "Google CSE"
|
||||||
|
],
|
||||||
[
|
[
|
||||||
"value" => "startpage",
|
"value" => "startpage",
|
||||||
"text" => "Startpage"
|
"text" => "Startpage"
|
||||||
|
|
Loading…
Reference in New Issue