2023-07-22 18:41:14 +00:00
|
|
|
<?php
|
|
|
|
|
|
|
|
class marginalia{
|
|
|
|
/**
 * Load the helper libraries and create the two collaborators used by
 * this scraper: the HTML parser ($this->fuckhtml) and the backend
 * helper ($this->backend), the latter created for the "marginalia"
 * service.
 */
public function __construct(){
	
	// include_once: the library files declare classes, so pulling them
	// in a second time within the same process would be a fatal
	// "Cannot declare class" error
	include_once "lib/fuckhtml.php";
	$this->fuckhtml = new fuckhtml();
	
	include_once "lib/backend.php";
	$this->backend = new backend("marginalia");
}
|
|
|
|
|
|
|
|
/**
 * Describe the search filters this scraper understands.
 *
 * Each entry maps a filter name to a "display" label and an "option"
 * list (option value => option label). The three yes/no toggles
 * (adtech, recent, intitle) are only listed when
 * config::MARGINALIA_API_KEY is null.
 *
 * @param mixed $page Not used by this method.
 * @return array Filter definitions keyed by filter name.
 */
public function getfilters($page){
	
	$yes_no = [
		"no" => "No",
		"yes" => "Yes"
	];
	
	$filters = [];
	
	if(config::MARGINALIA_API_KEY === null){
		
		// toggles listed only when no API key is configured
		$filters["adtech"] = [
			"display" => "Reduce adtech",
			"option" => $yes_no
		];
		
		$filters["recent"] = [
			"display" => "Recent results",
			"option" => $yes_no
		];
		
		$filters["intitle"] = [
			"display" => "Search in title",
			"option" => $yes_no
		];
	}
	
	// filters that are always listed
	$filters["format"] = [
		"display" => "Format",
		"option" => [
			"any" => "Any format",
			"html5" => "html5",
			"xhtml" => "xhtml",
			"html123" => "html123"
		]
	];
	
	$filters["file"] = [
		"display" => "Filetype",
		"option" => [
			"any" => "Any filetype",
			"nomedia" => "Deny media",
			"media" => "Contains media",
			"audio" => "Contains audio",
			"video" => "Contains video",
			"archive" => "Contains archive",
			"document" => "Contains document"
		]
	];
	
	$filters["javascript"] = [
		"display" => "Javascript",
		"option" => [
			"any" => "Allow JS",
			"deny" => "Deny JS",
			"require" => "Require JS"
		]
	];
	
	$filters["trackers"] = [
		"display" => "Trackers",
		"option" => [
			"any" => "Allow trackers",
			"deny" => "Deny trackers",
			"require" => "Require trackers"
		]
	];
	
	$filters["cookies"] = [
		"display" => "Cookies",
		"option" => [
			"any" => "Allow cookies",
			"deny" => "Deny cookies",
			"require" => "Require cookies"
		]
	];
	
	$filters["affiliate"] = [
		"display" => "Affiliate links in body",
		"option" => [
			"any" => "Allow affiliate links",
			"deny" => "Deny affiliate links",
			"require" => "Require affiliate links"
		]
	];
	
	return $filters;
}
|
|
|
|
|
2023-11-07 13:04:56 +00:00
|
|
|
/**
 * Perform an HTTP GET request through curl and return the response body.
 *
 * @param mixed  $proxy Proxy descriptor, handed unchanged to
 *                      $this->backend->assign_proxy().
 * @param string $url   Target URL without the parameters from $get.
 * @param array  $get   Query parameters; when not empty they are encoded
 *                      with http_build_query() and appended after a "?".
 * @return string|bool  Whatever curl_exec() returned.
 * @throws Exception    With the curl error message when curl reports
 *                      a non-zero error number.
 */
private function get($proxy, $url, $get = []){
	
	$headers = [
		"User-Agent: " . config::USER_AGENT,
		"Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
		"Accept-Language: en-US,en;q=0.5",
		"Accept-Encoding: gzip",
		"DNT: 1",
		"Connection: keep-alive",
		"Upgrade-Insecure-Requests: 1",
		"Sec-Fetch-Dest: document",
		"Sec-Fetch-Mode: navigate",
		"Sec-Fetch-Site: none",
		"Sec-Fetch-User: ?1"
	];
	
	if($get !== []){
		
		$url .= "?" . http_build_query($get);
	}
	
	$curlproc = curl_init();
	
	curl_setopt($curlproc, CURLOPT_URL, $url);
	
	curl_setopt($curlproc, CURLOPT_ENCODING, ""); // default encoding
	curl_setopt($curlproc, CURLOPT_HTTPHEADER, $headers);
	
	curl_setopt($curlproc, CURLOPT_RETURNTRANSFER, true);
	curl_setopt($curlproc, CURLOPT_SSL_VERIFYHOST, 2);
	curl_setopt($curlproc, CURLOPT_SSL_VERIFYPEER, true);
	curl_setopt($curlproc, CURLOPT_CONNECTTIMEOUT, 30);
	curl_setopt($curlproc, CURLOPT_TIMEOUT, 30);
	
	$this->backend->assign_proxy($curlproc, $proxy);
	
	$data = curl_exec($curlproc);
	
	if(curl_errno($curlproc)){
		
		// read the message first, then release the handle on the
		// failure path as well before reporting the error
		$message = curl_error($curlproc);
		curl_close($curlproc);
		
		throw new Exception($message);
	}
	
	curl_close($curlproc);
	return $data;
}
|
|
|
|
|
|
|
|
/**
 * Run a web search and return the results in the scraper output format.
 *
 * The search string is the user's query followed by one operator per
 * active filter. When config::MARGINALIA_API_KEY is set the JSON API at
 * api.marginalia.nu is queried and no next-page token is produced.
 * Otherwise the HTML of search.marginalia.nu is fetched and parsed, and
 * a next-page token is stored through $this->backend when the page has
 * a pagination link after the active one.
 *
 * @param array $get Request parameters. Reads "s", "format", "file",
 *                   "javascript", "trackers", "cookies", "affiliate",
 *                   and in the HTML path "npt", "adtech", "recent" and
 *                   "intitle".
 * @return array     Keys: status, spelling, npt, answer, web, image,
 *                   video, news, related. Only "web" and "npt" are
 *                   filled in by this method.
 * @throws Exception On an empty search term, a failed request, a rate
 *                   limited API key or an undecodable API response.
 */
public function web($get){
	
	if(strlen($get["s"]) === 0){
		
		throw new Exception("Search term is empty!");
	}
	
	$search = [$get["s"]];
	
	// search operator added for each allow/deny/require filter
	$operators = [
		"javascript" => "js:true",
		"trackers" => "special:tracking",
		"cookies" => "special:cookies",
		"affiliate" => "special:affiliate"
	];
	
	foreach($operators as $filter => $operator){
		
		$value = $get[$filter];
		
		if($value == "any"){ continue; }
		
		// "deny" negates the operator, any other value requires it
		$search[] = $value == "deny" ? "-" . $operator : $operator;
	}
	
	$format = $get["format"];
	$file = $get["file"];
	
	if($format != "any"){
		
		$search[] = "format:$format";
	}
	
	switch($file){
		
		case "any": break;
		case "nomedia": $search[] = "-special:media"; break;
		case "media": $search[] = "special:media"; break;
		
		default:
			$search[] = "file:$file";
	}
	
	$search = implode(" ", $search);
	
	$out = [
		"status" => "ok",
		"spelling" => [
			"type" => "no_correction",
			"using" => null,
			"correction" => null
		],
		"npt" => null,
		"answer" => [],
		"web" => [],
		"image" => [],
		"video" => [],
		"news" => [],
		"related" => []
	];
	
	// API scraper
	if(config::MARGINALIA_API_KEY !== null){
		
		try{
			$json =
				$this->get(
					$this->backend->get_ip(), // no nextpage
					"https://api.marginalia.nu/" . config::MARGINALIA_API_KEY . "/search/" . urlencode($search),
					[
						"count" => 20
					]
				);
		}catch(Exception $error){
			
			throw new Exception("Failed to get JSON");
		}
		
		if($json === "Slow down"){
			
			throw new Exception("The API key used is rate limited. Please try again in a few minutes.");
		}
		
		$json = json_decode($json, true);
		
		if(!is_array($json)){
			
			// body was not a JSON document (error page, empty or
			// failed transfer, ...)
			throw new Exception("Failed to decode JSON");
		}
		
		$results = $json["results"] ?? [];
		
		if(!is_array($results)){
			
			$results = [];
		}
		
		foreach($results as $result){
			
			$out["web"][] = [
				"title" => $result["title"],
				"description" => str_replace("\n", " ", $result["description"]),
				"url" => $result["url"],
				"date" => null,
				"type" => "web",
				"thumb" => [
					"url" => null,
					"ratio" => null
				],
				"sublink" => [],
				"table" => []
			];
		}
		
		return $out;
	}
	
	// HTML parser
	$proxy = $this->backend->get_ip();
	
	if($get["npt"]){
		
		// next page: the stored token yields the query string and the
		// proxy that go with it
		[$query, $proxy] =
			$this->backend->get(
				$get["npt"],
				"web"
			);
		
		$url = "https://search.marginalia.nu/search?" . $query;
		$params = [];
		
	}else{
		
		$url = "https://search.marginalia.nu/search";
		$params = [
			"query" => $search
		];
		
		// yes/no filter => [request parameter, value sent when "yes"]
		$toggles = [
			"adtech" => ["adtech", "reduce"],
			"recent" => ["recent", "recent"],
			"intitle" => ["searchTitle", "title"]
		];
		
		foreach($toggles as $filter => $toggle){
			
			if($get[$filter] == "yes"){
				
				$params[$toggle[0]] = $toggle[1];
			}
		}
	}
	
	try{
		$html =
			$this->get(
				$proxy,
				$url,
				$params
			);
	}catch(Exception $error){
		
		throw new Exception("Failed to get HTML");
	}
	
	$this->fuckhtml->load($html);
	
	$sections =
		$this->fuckhtml
		->getElementsByClassName(
			"card search-result",
			"section"
		);
	
	foreach($sections as $section){
		
		$this->fuckhtml->load($section);
		
		$title =
			$this->fuckhtml
			->getElementsByClassName(
				"title",
				"a"
			);
		
		if(count($title) === 0){
			
			// card without a title link: nothing to link to, skip it
			continue;
		}
		
		$title = $title[0];
		
		$description =
			$this->fuckhtml
			->getElementsByClassName(
				"description",
				"p"
			);
		
		if(count($description) !== 0){
			
			$description =
				$this->fuckhtml
				->getTextContent(
					$description[0]
				);
		}else{
			
			$description = null;
		}
		
		$sublinks = [];
		$sublink_html =
			$this->fuckhtml
			->getElementsByClassName("additional-results");
		
		if(count($sublink_html) !== 0){
			
			$this->fuckhtml->load($sublink_html[0]);
			
			$links =
				$this->fuckhtml
				->getElementsByTagName("a");
			
			foreach($links as $link){
				
				$sublinks[] = [
					"title" =>
						$this->fuckhtml
						->getTextContent(
							$link
						),
					"date" => null,
					"description" => null,
					"url" =>
						$this->fuckhtml
						->getTextContent(
							$link["attributes"]["href"]
						)
				];
			}
		}
		
		$out["web"][] = [
			"title" =>
				$this->fuckhtml
				->getTextContent(
					$title
				),
			"description" => $description,
			"url" =>
				$this->fuckhtml
				->getTextContent(
					$title["attributes"]["href"]
				),
			"date" => null,
			"type" => "web",
			"thumb" => [
				"url" => null,
				"ratio" => null
			],
			"sublink" => $sublinks,
			"table" => []
		];
	}
	
	// get next page
	$this->fuckhtml->load($html);
	
	$pagination =
		$this->fuckhtml
		->getElementsByAttributeValue(
			"aria-label",
			"pagination",
			"nav"
		);
	
	if(count($pagination) === 0){
		
		// no pagination
		return $out;
	}
	
	$this->fuckhtml->load($pagination[0]);
	
	$pages =
		$this->fuckhtml
		->getElementsByClassName(
			"page-link",
			"a"
		);
	
	$found_current_page = false;
	
	foreach($pages as $page){
		
		if(
			stripos(
				$page["attributes"]["class"],
				"active"
			) !== false
		){
			
			$found_current_page = true;
			continue;
		}
		
		if($found_current_page){
			
			// we found current page index, and we iterated over
			// the next page <a>
			
			// NOTE(review): result URLs above are passed through
			// getTextContent() before use, this href is not -- confirm
			// the stored query string does not need the same treatment
			$out["npt"] =
				$this->backend->store(
					parse_url(
						$page["attributes"]["href"],
						PHP_URL_QUERY
					),
					"web",
					$proxy
				);
			break;
		}
	}
	
	return $out;
}
|
|
|
|
}
|
|
|
|
|