forked from lolcat/4get
Compare commits
17 Commits
Author | SHA1 | Date |
---|---|---|
lolcat | 9ca93f34c6 | |
lolcat | 0a43b9c849 | |
lolcat | b636fec319 | |
lolcat | 774f7113df | |
lolcat | 0b3bbe0f15 | |
lolcat | 5f0b0a7b83 | |
lolcat | 920b9d5b3f | |
lolcat | 9cd369ac08 | |
lolcat | e83865be49 | |
lolcat | 68dd7f29f6 | |
lolcat | aaa30c79f5 | |
lolcat | 070f9d442b | |
lolcat | 9c18753ec3 | |
lolcat | d8a729796e | |
lolcat | 2bbe5a29a9 | |
lolcat | 9ac195ac3b | |
lolcat | d427a48ed4 |
21
api.txt
21
api.txt
|
@ -1,10 +1,17 @@
|
|||
__ __ __
|
||||
/ // / ____ ____ / /_
|
||||
/ // /_/ __ `/ _ \/ __/
|
||||
/__ __/ /_/ / __/ /_
|
||||
/_/ \__, /\___/\__/
|
||||
/____/
|
||||
|
||||
44
|
||||
4444444 44
|
||||
44444444 44444 444
|
||||
44444444 444444 444444444
|
||||
44444 44444444 444444444
|
||||
444444444 4444444
|
||||
4444444444 444444
|
||||
4444444444444
|
||||
444444444444444444
|
||||
444444444444444
|
||||
44444444
|
||||
4444
|
||||
44
|
||||
|
||||
+ Welcome to the 4get API documentation +
|
||||
|
||||
+ Terms of use
|
||||
|
|
|
@ -119,7 +119,7 @@ class config{
|
|||
|
||||
// Default user agent to use for scraper requests. Sometimes ignored to get specific webpages
|
||||
// Changing this might break things.
|
||||
const USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:129.0) Gecko/20100101 Firefox/129.0";
|
||||
const USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:133.0) Gecko/20100101 Firefox/133.0";
|
||||
|
||||
// Proxy pool assignments for each scraper
|
||||
// false = Use server's raw IP
|
||||
|
@ -129,6 +129,7 @@ class config{
|
|||
const PROXY_BRAVE = false;
|
||||
const PROXY_FB = false; // facebook
|
||||
const PROXY_GOOGLE = false;
|
||||
const PROXY_GOOGLE_CSE = false;
|
||||
const PROXY_STARTPAGE = false;
|
||||
const PROXY_QWANT = false;
|
||||
const PROXY_GHOSTERY = false;
|
||||
|
@ -157,6 +158,9 @@ class config{
|
|||
// Scraper-specific parameters
|
||||
//
|
||||
|
||||
// GOOGLE CSE
|
||||
const GOOGLE_CX_ENDPOINT = "d4e68b99b876541f0";
|
||||
|
||||
// MARGINALIA
|
||||
// Use "null" to default out to HTML scraping OR specify a string to
|
||||
// use the API (Eg: "public"). API has less filters.
|
||||
|
|
|
@ -75,6 +75,7 @@ class backend{
|
|||
break;
|
||||
|
||||
case "socks5_hostname":
|
||||
case "socks5h":
|
||||
case "socks5a":
|
||||
curl_setopt($curlproc, CURLOPT_PROXYTYPE, CURLPROXY_SOCKS5_HOSTNAME);
|
||||
curl_setopt($curlproc, CURLOPT_PROXY, $address . ":" . $port);
|
||||
|
|
|
@ -838,10 +838,10 @@ class frontend{
|
|||
}
|
||||
|
||||
$payload .=
|
||||
'<a href="https://webcache.googleusercontent.com/search?q=cache:' . $urlencode . '" class="list" target="_BLANK"><img src="/favicon?s=https://google.com" alt="go">Google cache</a>' .
|
||||
'<a href="https://web.archive.org/web/' . $urlencode . '" class="list" target="_BLANK"><img src="/favicon?s=https://archive.org" alt="ar">Archive.org</a>' .
|
||||
'<a href="https://archive.ph/newest/' . htmlspecialchars($link) . '" class="list" target="_BLANK"><img src="/favicon?s=https://archive.is" alt="ar">Archive.is</a>' .
|
||||
'<a href="https://ghostarchive.org/search?term=' . $urlencode . '" class="list" target="_BLANK"><img src="/favicon?s=https://ghostarchive.org" alt="gh">Ghostarchive</a>' .
|
||||
'<a href="https://arquivo.pt/wayback/' . htmlspecialchars($link) . '" class="list" target="_BLANK"><img src="/favicon?s=https://arquivo.pt" alt="ar">Arquivo.pt</a>' .
|
||||
'<a href="https://www.bing.com/search?q=url%3A' . $urlencode . '" class="list" target="_BLANK"><img src="/favicon?s=https://bing.com" alt="bi">Bing cache</a>' .
|
||||
'<a href="https://megalodon.jp/?url=' . $urlencode . '" class="list" target="_BLANK"><img src="/favicon?s=https://megalodon.jp" alt="me">Megalodon</a>' .
|
||||
'</div>';
|
||||
|
@ -939,6 +939,7 @@ class frontend{
|
|||
"brave" => "Brave",
|
||||
"yandex" => "Yandex",
|
||||
"google" => "Google",
|
||||
"google_cse" => "Google CSE",
|
||||
"startpage" => "Startpage",
|
||||
"qwant" => "Qwant",
|
||||
"ghostery" => "Ghostery",
|
||||
|
@ -963,6 +964,7 @@ class frontend{
|
|||
"yandex" => "Yandex",
|
||||
"brave" => "Brave",
|
||||
"google" => "Google",
|
||||
"google_cse" => "Google CSE",
|
||||
"startpage" => "Startpage",
|
||||
"qwant" => "Qwant",
|
||||
"yep" => "Yep",
|
||||
|
|
109
lib/fuckhtml.php
109
lib/fuckhtml.php
|
@ -381,6 +381,8 @@ class fuckhtml{
|
|||
$json_out = null;
|
||||
$last_char = null;
|
||||
|
||||
$keyword_check = null;
|
||||
|
||||
for($i=0; $i<strlen($json); $i++){
|
||||
|
||||
switch($json[$i]){
|
||||
|
@ -396,6 +398,7 @@ class fuckhtml{
|
|||
|
||||
$bracket = false;
|
||||
$is_close_bracket = true;
|
||||
|
||||
}else{
|
||||
|
||||
if($bracket === false){
|
||||
|
@ -429,6 +432,31 @@ class fuckhtml{
|
|||
$is_close_bracket === false
|
||||
){
|
||||
|
||||
// do keyword check
|
||||
$keyword_check .= $json[$i];
|
||||
|
||||
if(in_array($json[$i], [":", "{"])){
|
||||
|
||||
$keyword_check = substr($keyword_check, 0, -1);
|
||||
|
||||
if(
|
||||
preg_match(
|
||||
'/function|array|return/i',
|
||||
$keyword_check
|
||||
)
|
||||
){
|
||||
|
||||
$json_out =
|
||||
preg_replace(
|
||||
'/[{"]*' . preg_quote($keyword_check, "/") . '$/',
|
||||
"",
|
||||
$json_out
|
||||
);
|
||||
}
|
||||
|
||||
$keyword_check = null;
|
||||
}
|
||||
|
||||
// here we know we're not iterating over a quoted string
|
||||
switch($json[$i]){
|
||||
|
||||
|
@ -498,4 +526,85 @@ class fuckhtml{
|
|||
$string
|
||||
);
|
||||
}
|
||||
|
||||
/*
	Extract the first balanced JSON/JS array or object found in $json.

	Scans the input for the first "[" or "{" that is not inside a quoted
	string, then returns the substring up to and including the bracket
	that brings both nesting depths back to zero.

	@param string $json  Arbitrary text that contains a JSON-like payload
	@return string|null  The extracted payload, or null when no balanced
	                     array/object could be found

	Notes:
	- Both double and single quotes are treated as string delimiters,
	  and quotes are tracked even before the payload starts, so brackets
	  inside quoted text that precedes the payload are ignored.
	- A quote is considered escaped only when it is preceded by an ODD
	  number of backslashes ("\\" followed by a quote closes the string).
*/
public function extract_json($json){
	
	$len = strlen($json);
	$array_level = 0;
	$object_level = 0;
	$in_quote = null;
	$start = null;
	
	for($i=0; $i<$len; $i++){
		
		$char = $json[$i];
		
		switch($char){
			
			case "\"":
			case "'":
				// count the run of backslashes right before this quote
				$backslashes = 0;
				
				for($k=$i - 1; $k >= 0 && $json[$k] === "\\"; $k--){
					
					$backslashes++;
				}
				
				if($backslashes % 2 === 1){
					
					// escaped quote, not a delimiter
					break;
				}
				
				if($in_quote === null){
					
					// open quote
					$in_quote = $char;
				}elseif($in_quote === $char){
					
					// close quote
					$in_quote = null;
				}
				break;
			
			case "[":
			case "{":
				if($in_quote !== null){
					
					break;
				}
				
				if($char === "["){
					
					$array_level++;
				}else{
					
					$object_level++;
				}
				
				if($start === null){
					
					$start = $i;
				}
				break;
			
			case "]":
				// never go below zero: a stray closer before the payload
				// must not offset the depth of the real payload
				if(
					$in_quote === null &&
					$array_level > 0
				){
					
					$array_level--;
				}
				break;
			
			case "}":
				if(
					$in_quote === null &&
					$object_level > 0
				){
					
					$object_level--;
				}
				break;
		}
		
		if(
			$start !== null &&
			$array_level === 0 &&
			$object_level === 0
		){
			
			// outermost bracket just closed
			return substr($json, $start, $i - $start + 1);
		}
	}
	
	// no balanced array/object found
	return null;
}
|
||||
}
|
||||
|
|
|
@ -293,8 +293,8 @@ class brave{
|
|||
/*
|
||||
$handle = fopen("scraper/brave.html", "r");
|
||||
$html = fread($handle, filesize("scraper/brave.html"));
|
||||
fclose($handle);
|
||||
*/
|
||||
fclose($handle);*/
|
||||
|
||||
|
||||
try{
|
||||
$html =
|
||||
|
@ -410,10 +410,20 @@ class brave{
|
|||
throw new Exception("Could not grep JavaScript object");
|
||||
}
|
||||
|
||||
$data =
|
||||
rtrim(
|
||||
preg_replace(
|
||||
'/\(Array\(0\)\)\).*$/',
|
||||
"",
|
||||
$grep[1]
|
||||
),
|
||||
" ]"
|
||||
) . "]";
|
||||
|
||||
$data =
|
||||
$this->fuckhtml
|
||||
->parseJsObject(
|
||||
$grep[1]
|
||||
$data
|
||||
);
|
||||
unset($grep);
|
||||
|
||||
|
@ -663,7 +673,10 @@ class brave{
|
|||
$table["Address"] = $result["location"]["postal_address"]["displayAddress"];
|
||||
}
|
||||
|
||||
if(isset($result["location"]["rating"])){
|
||||
if(
|
||||
isset($result["location"]["rating"]) &&
|
||||
$result["location"]["rating"] != "void 0"
|
||||
){
|
||||
|
||||
$table["Rating"] =
|
||||
$result["location"]["rating"]["ratingValue"] . "/" .
|
||||
|
@ -671,13 +684,19 @@ class brave{
|
|||
number_format($result["location"]["rating"]["reviewCount"]) . " votes)";
|
||||
}
|
||||
|
||||
if(isset($result["location"]["contact"]["telephone"])){
|
||||
if(
|
||||
isset($result["location"]["contact"]["telephone"]) &&
|
||||
$result["location"]["contact"]["telephone"] != "void 0"
|
||||
){
|
||||
|
||||
$table["Phone number"] =
|
||||
$result["location"]["contact"]["telephone"];
|
||||
}
|
||||
|
||||
if(isset($result["location"]["price_range"])){
|
||||
if(
|
||||
isset($result["location"]["price_range"]) &&
|
||||
$result["location"]["price_range"] != "void 0"
|
||||
){
|
||||
|
||||
$table["Price"] =
|
||||
$result["location"]["price_range"];
|
||||
|
|
3643
scraper/ddg.php
3643
scraper/ddg.php
File diff suppressed because it is too large
Load Diff
|
@ -136,7 +136,7 @@ class ftm{
|
|||
"source" => [
|
||||
[
|
||||
"url" =>
|
||||
"https://findthatmeme.us-southeast-1.linodeobjects.com/" .
|
||||
"https://s3.thehackerblog.com/findthatmeme/" .
|
||||
$thumb,
|
||||
"width" => null,
|
||||
"height" => null
|
||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -220,6 +220,7 @@ class marginalia{
|
|||
"related" => []
|
||||
];
|
||||
|
||||
// API scraper
|
||||
if(config::MARGINALIA_API_KEY !== null){
|
||||
|
||||
try{
|
||||
|
@ -263,34 +264,57 @@ class marginalia{
|
|||
return $out;
|
||||
}
|
||||
|
||||
// no more cloudflare!! Parse html by default
|
||||
$params = [
|
||||
"query" => $search
|
||||
];
|
||||
// HTML parser
|
||||
$proxy = $this->backend->get_ip();
|
||||
|
||||
foreach(["adtech", "recent", "intitle"] as $v){
|
||||
if($get["npt"]){
|
||||
|
||||
if($get[$v] == "yes"){
|
||||
[$params, $proxy] =
|
||||
$this->backend->get(
|
||||
$get["npt"],
|
||||
"web"
|
||||
);
|
||||
|
||||
try{
|
||||
$html =
|
||||
$this->get(
|
||||
$proxy,
|
||||
"https://search.marginalia.nu/search?" . $params
|
||||
);
|
||||
}catch(Exception $error){
|
||||
|
||||
switch($v){
|
||||
throw new Exception("Failed to get HTML");
|
||||
}
|
||||
|
||||
}else{
|
||||
$params = [
|
||||
"query" => $search
|
||||
];
|
||||
|
||||
foreach(["adtech", "recent", "intitle"] as $v){
|
||||
|
||||
if($get[$v] == "yes"){
|
||||
|
||||
case "adtech": $params["adtech"] = "reduce"; break;
|
||||
case "recent": $params["recent"] = "recent"; break;
|
||||
case "adtech": $params["searchTitle"] = "title"; break;
|
||||
switch($v){
|
||||
|
||||
case "adtech": $params["adtech"] = "reduce"; break;
|
||||
case "recent": $params["recent"] = "recent"; break;
|
||||
case "adtech": $params["searchTitle"] = "title"; break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
try{
|
||||
$html =
|
||||
$this->get(
|
||||
$this->backend->get_ip(),
|
||||
"https://search.marginalia.nu/search",
|
||||
$params
|
||||
);
|
||||
}catch(Exception $error){
|
||||
|
||||
throw new Exception("Failed to get HTML");
|
||||
try{
|
||||
$html =
|
||||
$this->get(
|
||||
$proxy,
|
||||
"https://search.marginalia.nu/search",
|
||||
$params
|
||||
);
|
||||
}catch(Exception $error){
|
||||
|
||||
throw new Exception("Failed to get HTML");
|
||||
}
|
||||
}
|
||||
|
||||
$this->fuckhtml->load($html);
|
||||
|
@ -387,6 +411,65 @@ class marginalia{
|
|||
];
|
||||
}
|
||||
|
||||
// get next page
|
||||
$this->fuckhtml->load($html);
|
||||
|
||||
$pagination =
|
||||
$this->fuckhtml
|
||||
->getElementsByAttributeValue(
|
||||
"aria-label",
|
||||
"pagination",
|
||||
"nav"
|
||||
);
|
||||
|
||||
if(count($pagination) === 0){
|
||||
|
||||
// no pagination
|
||||
return $out;
|
||||
}
|
||||
|
||||
$this->fuckhtml->load($pagination[0]);
|
||||
|
||||
$pages =
|
||||
$this->fuckhtml
|
||||
->getElementsByClassName(
|
||||
"page-link",
|
||||
"a"
|
||||
);
|
||||
|
||||
$found_current_page = false;
|
||||
|
||||
foreach($pages as $page){
|
||||
|
||||
if(
|
||||
stripos(
|
||||
$page["attributes"]["class"],
|
||||
"active"
|
||||
) !== false
|
||||
){
|
||||
|
||||
$found_current_page = true;
|
||||
continue;
|
||||
}
|
||||
|
||||
if($found_current_page){
|
||||
|
||||
// we found current page index, and we iterated over
|
||||
// the next page <a>
|
||||
|
||||
$out["npt"] =
|
||||
$this->backend->store(
|
||||
parse_url(
|
||||
$page["attributes"]["href"],
|
||||
PHP_URL_QUERY
|
||||
),
|
||||
"web",
|
||||
$proxy
|
||||
);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
return $out;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -701,9 +701,11 @@ class mojeek{
|
|||
if(count($thumb) === 2){
|
||||
|
||||
$answer["thumb"] =
|
||||
$this->fuckhtml
|
||||
->getTextContent(
|
||||
$thumb[1]
|
||||
urldecode(
|
||||
$this->fuckhtml
|
||||
->getTextContent(
|
||||
$thumb[1]
|
||||
)
|
||||
);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -133,6 +133,10 @@ $settings = [
|
|||
"value" => "google",
|
||||
"text" => "Google"
|
||||
],
|
||||
[
|
||||
"value" => "google_cse",
|
||||
"text" => "Google CSE"
|
||||
],
|
||||
[
|
||||
"value" => "startpage",
|
||||
"text" => "Startpage"
|
||||
|
@ -203,6 +207,10 @@ $settings = [
|
|||
"value" => "google",
|
||||
"text" => "Google"
|
||||
],
|
||||
[
|
||||
"value" => "google_cse",
|
||||
"text" => "Google CSE"
|
||||
],
|
||||
[
|
||||
"value" => "startpage",
|
||||
"text" => "Startpage"
|
||||
|
|
|
@ -16,6 +16,7 @@
|
|||
|
||||
body{
|
||||
padding:15px 4% 40px;
|
||||
margin:unset;
|
||||
}
|
||||
|
||||
h1,h2,h3,h4,h5,h6{
|
||||
|
|
Loading…
Reference in New Issue