1
0
forked from lolcat/4get
This commit is contained in:
2024-04-21 19:31:56 -04:00
parent 9e18327df6
commit 130358a9e0
16 changed files with 1385 additions and 457 deletions

View File

@@ -3,78 +3,103 @@
class marginalia{
/**
 * Pulls in the HTML parser and backend helper libraries, then attaches
 * one instance of each to this scraper. The backend is created under
 * the "marginalia" name.
 */
public function __construct(){
    
    // load both libraries up front, parser first
    include "lib/fuckhtml.php";
    include "lib/backend.php";
    
    $this->fuckhtml = new fuckhtml();
    $this->backend = new backend("marginalia");
}
/**
 * Describe the search filters this scraper exposes.
 *
 * When no API key is configured (config::MARGINALIA_API_KEY === null)
 * three extra yes/no toggles are offered: "adtech", "recent" and
 * "intitle". With an API key set, only the shared filters are returned.
 *
 * @param string $page Requested page type. Currently unused: the same
 *                     filter set is returned for every value.
 * @return array Map of filter name => ["display" => label,
 *               "option" => [value => label, ...]].
 */
public function getfilters($page){
    
    if(config::MARGINALIA_API_KEY === null){
        
        // toggles only offered when no API key is configured
        $base = [
            "adtech" => [
                "display" => "Reduce adtech",
                "option" => [
                    "no" => "No",
                    "yes" => "Yes"
                ]
            ],
            "recent" => [
                "display" => "Recent results",
                "option" => [
                    "no" => "No",
                    "yes" => "Yes"
                ]
            ],
            "intitle" => [
                "display" => "Search in title",
                "option" => [
                    "no" => "No",
                    "yes" => "Yes"
                ]
            ]
        ];
    }else{
        
        $base = [];
    }
    
    // filters available regardless of API key configuration
    return array_merge(
        $base,
        [
            "format" => [
                "display" => "Format",
                "option" => [
                    "any" => "Any format",
                    "html5" => "html5",
                    "xhtml" => "xhtml",
                    "html123" => "html123"
                ]
            ],
            "file" => [
                "display" => "Filetype",
                "option" => [
                    "any" => "Any filetype",
                    "nomedia" => "Deny media",
                    "media" => "Contains media",
                    "audio" => "Contains audio",
                    "video" => "Contains video",
                    "archive" => "Contains archive",
                    "document" => "Contains document"
                ]
            ],
            "javascript" => [
                "display" => "Javascript",
                "option" => [
                    "any" => "Allow JS",
                    "deny" => "Deny JS",
                    "require" => "Require JS"
                ]
            ],
            "trackers" => [
                "display" => "Trackers",
                "option" => [
                    "any" => "Allow trackers",
                    "deny" => "Deny trackers",
                    "require" => "Require trackers"
                ]
            ],
            "cookies" => [
                "display" => "Cookies",
                "option" => [
                    "any" => "Allow cookies",
                    "deny" => "Deny cookies",
                    "require" => "Require cookies"
                ]
            ],
            "affiliate" => [
                "display" => "Affiliate links in body",
                "option" => [
                    "any" => "Allow affiliate links",
                    "deny" => "Deny affiliate links",
                    "require" => "Require affiliate links"
                ]
            ]
        ]
    );
}
private function get($proxy, $url, $get = []){
@@ -132,7 +157,6 @@ class marginalia{
throw new Exception("Search term is empty!");
}
$profile = $get["profile"];
$format = $get["format"];
$file = $get["file"];
@@ -180,38 +204,6 @@ class marginalia{
$search = implode(" ", $search);
$params = [
"count" => 20
];
if($profile == "modern"){
$params["index"] = 1;
}
try{
$json =
$this->get(
$this->backend->get_ip(), // no nextpage
"https://api.marginalia.nu/" . config::MARGINALIA_API_KEY . "/search/" . urlencode($search),
$params
);
}catch(Exception $error){
throw new Exception("Failed to get JSON");
}
if($json == "Slow down"){
throw new Exception("The API key used is rate limited. Please try again in a few minutes.");
}
$json = json_decode($json, true);
/*
$handle = fopen("scraper/marginalia.json", "r");
$json = json_decode(fread($handle, filesize("scraper/marginalia.json")), true);
fclose($handle);*/
$out = [
"status" => "ok",
"spelling" => [
@@ -228,19 +220,169 @@ class marginalia{
"related" => []
];
foreach($json["results"] as $result){
if(config::MARGINALIA_API_KEY !== null){
try{
$json =
$this->get(
$this->backend->get_ip(), // no nextpage
"https://api.marginalia.nu/" . config::MARGINALIA_API_KEY . "/search/" . urlencode($search),
[
"count" => 20
]
);
}catch(Exception $error){
throw new Exception("Failed to get JSON");
}
if($json == "Slow down"){
throw new Exception("The API key used is rate limited. Please try again in a few minutes.");
}
$json = json_decode($json, true);
foreach($json["results"] as $result){
$out["web"][] = [
"title" => $result["title"],
"description" => str_replace("\n", " ", $result["description"]),
"url" => $result["url"],
"date" => null,
"type" => "web",
"thumb" => [
"url" => null,
"ratio" => null
],
"sublink" => [],
"table" => []
];
}
return $out;
}
// no more cloudflare!! Parse html by default
$params = [
"query" => $search
];
// Translate each enabled yes/no filter into the query parameter sent with
// the HTML search request: filter name => [parameter name, parameter value].
// Note that the "intitle" filter is sent as searchTitle=title.
$toggles = [
    "adtech" => ["adtech", "reduce"],
    "recent" => ["recent", "recent"],
    "intitle" => ["searchTitle", "title"]
];

foreach($toggles as $v => $target){
    
    if($get[$v] == "yes"){
        
        $params[$target[0]] = $target[1];
    }
}
try{
$html =
$this->get(
$this->backend->get_ip(),
"https://search.marginalia.nu/search",
$params
);
}catch(Exception $error){
throw new Exception("Failed to get HTML");
}
$this->fuckhtml->load($html);
$sections =
$this->fuckhtml
->getElementsByClassName(
"card search-result",
"section"
);
foreach($sections as $section){
$this->fuckhtml->load($section);
$title =
$this->fuckhtml
->getElementsByClassName(
"title",
"a"
)[0];
$description =
$this->fuckhtml
->getElementsByClassName(
"description",
"p"
);
if(count($description) !== 0){
$description =
$this->fuckhtml
->getTextContent(
$description[0]
);
}else{
$description = null;
}
$sublinks = [];
$sublink_html =
$this->fuckhtml
->getElementsByClassName("additional-results");
if(count($sublink_html) !== 0){
$this->fuckhtml->load($sublink_html[0]);
$links =
$this->fuckhtml
->getElementsByTagName("a");
foreach($links as $link){
$sublinks[] = [
"title" =>
$this->fuckhtml
->getTextContent(
$link
),
"date" => null,
"description" => null,
"url" =>
$this->fuckhtml
->getTextContent(
$link["attributes"]["href"]
)
];
}
}
$out["web"][] = [
"title" => $result["title"],
"description" => str_replace("\n", " ", $result["description"]),
"url" => $result["url"],
"title" =>
$this->fuckhtml
->getTextContent(
$title
),
"description" => $description,
"url" =>
$this->fuckhtml
->getTextContent(
$title["attributes"]["href"]
),
"date" => null,
"type" => "web",
"thumb" => [
"url" => null,
"ratio" => null
],
"sublink" => [],
"sublink" => $sublinks,
"table" => []
];
}