4get/scraper/marginalia.php

477 lines
8.9 KiB
PHP
Raw Permalink Normal View History

2023-07-22 18:41:14 +00:00
<?php
/**
 * Scraper for the Marginalia search engine.
 *
 * Two code paths exist, selected by config::MARGINALIA_API_KEY:
 *  - key set:  query the JSON API at api.marginalia-search.com
 *  - key null: fetch and parse the HTML of old-search.marginalia.nu
 */
class marginalia{
	
	/** @var object HTML parser instance (class fuckhtml from lib/fuckhtml.php). */
	public $fuckhtml;
	
	/** @var object Backend helper (class backend from lib/backend.php) used for proxies and next-page tokens. */
	public $backend;
	
	/**
	 * Loads the HTML parser and the backend helper this scraper relies on.
	 */
	public function __construct(){
		
		include "lib/fuckhtml.php";
		$this->fuckhtml = new fuckhtml();
		
		include "lib/backend.php";
		$this->backend = new backend("marginalia");
	}
	
	/**
	 * Describes the filters this scraper understands.
	 *
	 * The "adtech", "recent" and "intitle" toggles are only offered when no
	 * API key is configured; web() only applies them on the HTML path.
	 *
	 * @param mixed $page Not used by this scraper.
	 * @return array Map of filter name => ["display" => label, "option" => [value => label]].
	 */
	public function getfilters($page){
		
		$yes_no = [
			"no" => "No",
			"yes" => "Yes"
		];
		
		$base = [];
		
		if(config::MARGINALIA_API_KEY === null){
			
			$base = [
				"adtech" => [
					"display" => "Reduce adtech",
					"option" => $yes_no
				],
				"recent" => [
					"display" => "Recent results",
					"option" => $yes_no
				],
				"intitle" => [
					"display" => "Search in title",
					"option" => $yes_no
				]
			];
		}
		
		$common = [
			"format" => [
				"display" => "Format",
				"option" => [
					"any" => "Any format",
					"html5" => "html5",
					"xhtml" => "xhtml",
					"html123" => "html123"
				]
			],
			"file" => [
				"display" => "Filetype",
				"option" => [
					"any" => "Any filetype",
					"nomedia" => "Deny media",
					"media" => "Contains media",
					"audio" => "Contains audio",
					"video" => "Contains video",
					"archive" => "Contains archive",
					"document" => "Contains document"
				]
			],
			"javascript" => [
				"display" => "Javascript",
				"option" => [
					"any" => "Allow JS",
					"deny" => "Deny JS",
					"require" => "Require JS"
				]
			],
			"trackers" => [
				"display" => "Trackers",
				"option" => [
					"any" => "Allow trackers",
					"deny" => "Deny trackers",
					"require" => "Require trackers"
				]
			],
			"cookies" => [
				"display" => "Cookies",
				"option" => [
					"any" => "Allow cookies",
					"deny" => "Deny cookies",
					"require" => "Require cookies"
				]
			],
			"affiliate" => [
				"display" => "Affiliate links in body",
				"option" => [
					"any" => "Allow affiliate links",
					"deny" => "Deny affiliate links",
					"require" => "Require affiliate links"
				]
			]
		];
		
		return array_merge($base, $common);
	}
	
	/**
	 * Performs an HTTP GET request through the given proxy.
	 *
	 * @param mixed  $proxy Proxy descriptor, handed as-is to backend::assign_proxy().
	 * @param string $url   Target URL. When $get is non-empty it must not already carry a query string.
	 * @param array  $get   Query parameters appended to $url.
	 * @return string|bool  Whatever curl_exec() returned (the response body on success).
	 * @throws Exception    With the cURL error message when the transfer fails.
	 */
	private function get($proxy, $url, $get = []){
		
		$headers = [
			"User-Agent: " . config::USER_AGENT,
			"Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
			"Accept-Language: en-US,en;q=0.5",
			"Accept-Encoding: gzip",
			"DNT: 1",
			"Connection: keep-alive",
			"Upgrade-Insecure-Requests: 1",
			"Sec-Fetch-Dest: document",
			"Sec-Fetch-Mode: navigate",
			"Sec-Fetch-Site: none",
			"Sec-Fetch-User: ?1"
		];
		
		if($get !== []){
			
			$url .= "?" . http_build_query($get);
		}
		
		$curlproc = curl_init();
		
		curl_setopt($curlproc, CURLOPT_URL, $url);
		curl_setopt($curlproc, CURLOPT_ENCODING, ""); // default encoding
		curl_setopt($curlproc, CURLOPT_HTTPHEADER, $headers);
		
		curl_setopt($curlproc, CURLOPT_RETURNTRANSFER, true);
		curl_setopt($curlproc, CURLOPT_SSL_VERIFYHOST, 2);
		curl_setopt($curlproc, CURLOPT_SSL_VERIFYPEER, true);
		curl_setopt($curlproc, CURLOPT_CONNECTTIMEOUT, 30);
		curl_setopt($curlproc, CURLOPT_TIMEOUT, 30);
		
		$this->backend->assign_proxy($curlproc, $proxy);
		
		$data = curl_exec($curlproc);
		
		// grab the error text first: it has to be read while the handle is
		// still open, and the handle must be closed on the failure path too
		$error = curl_errno($curlproc) ? curl_error($curlproc) : null;
		curl_close($curlproc);
		
		if($error !== null){
			
			throw new Exception($error);
		}
		
		return $data;
	}
	
	/**
	 * Runs a web search.
	 *
	 * @param array $get Request parameters. Reads "s" (search terms), "format",
	 *                   "file", "javascript", "trackers", "cookies", "affiliate"
	 *                   and, on the HTML path only, "npt", "adtech", "recent"
	 *                   and "intitle".
	 * @return array Result structure with the keys "status", "spelling", "npt",
	 *               "answer", "web", "image", "video", "news" and "related".
	 * @throws Exception When the search term is empty or the upstream request fails.
	 */
	public function web($get){
		
		if(strlen($get["s"]) === 0){
			
			throw new Exception("Search term is empty!");
		}
		
		$search = $this->build_search_string($get);
		
		$out = [
			"status" => "ok",
			"spelling" => [
				"type" => "no_correction",
				"using" => null,
				"correction" => null
			],
			"npt" => null,
			"answer" => [],
			"web" => [],
			"image" => [],
			"video" => [],
			"news" => [],
			"related" => []
		];
		
		// API scraper
		if(config::MARGINALIA_API_KEY !== null){
			
			return $this->web_api($search, $out);
		}
		
		// HTML parser
		return $this->web_html($get, $search, $out);
	}
	
	/**
	 * Turns the search terms plus the selected filters into one query string
	 * by appending search operators to the user's terms.
	 *
	 * @param array $get Request parameters as passed to web().
	 * @return string Space-separated query.
	 */
	private function build_search_string($get){
		
		$search = [$get["s"]];
		
		// filter name => operator that *requires* the feature;
		// a leading "-" turns it into a denial
		$operators = [
			"javascript" => "js:true",
			"trackers" => "special:tracking",
			"cookies" => "special:cookies",
			"affiliate" => "special:affiliate"
		];
		
		foreach($operators as $filter => $operator){
			
			$value = $get[$filter];
			
			if($value == "any"){ continue; }
			
			$search[] = $value == "deny" ? "-" . $operator : $operator;
		}
		
		$format = $get["format"];
		
		if($format != "any"){
			
			$search[] = "format:$format";
		}
		
		$file = $get["file"];
		
		switch($file){
			
			case "any": break;
			case "nomedia": $search[] = "-special:media"; break;
			case "media": $search[] = "special:media"; break;
			
			default:
				$search[] = "file:$file";
		}
		
		return implode(" ", $search);
	}
	
	/**
	 * Fetches results from the Marginalia JSON API.
	 *
	 * @param string $search Query built by build_search_string().
	 * @param array  $out    Empty result skeleton to fill in.
	 * @return array $out with "web" populated. "npt" stays null on this path.
	 * @throws Exception On transport failure, rate limiting or an undecodable body.
	 */
	private function web_api($search, $out){
		
		try{
			$json =
				$this->get(
					$this->backend->get_ip(), // no nextpage
					"https://api.marginalia-search.com/" . config::MARGINALIA_API_KEY . "/search/" . urlencode($search),
					[
						"count" => 20
					]
				);
		}catch(Exception $error){
			
			throw new Exception("Failed to get JSON", 0, $error);
		}
		
		if($json === "Slow down"){
			
			throw new Exception("The API key used is rate limited. Please try again in a few minutes.");
		}
		
		$json = json_decode($json, true);
		
		if(!is_array($json)){
			
			// json_decode() yields null (or a scalar) for a body that is not
			// a JSON object; iterating that would only produce warnings
			throw new Exception("Failed to decode JSON");
		}
		
		$results = $json["results"] ?? [];
		
		if(!is_array($results)){
			
			$results = [];
		}
		
		foreach($results as $result){
			
			$out["web"][] = [
				"title" => $result["title"],
				"description" => str_replace("\n", " ", $result["description"]),
				"url" => $result["url"],
				"date" => null,
				"type" => "web",
				"thumb" => [
					"url" => null,
					"ratio" => null
				],
				"sublink" => [],
				"table" => []
			];
		}
		
		return $out;
	}
	
	/**
	 * Fetches a result page from old-search.marginalia.nu and parses it.
	 *
	 * @param array  $get    Request parameters as passed to web().
	 * @param string $search Query built by build_search_string().
	 * @param array  $out    Empty result skeleton to fill in.
	 * @return array $out with "web" and, when a following page exists, "npt" populated.
	 * @throws Exception When the page cannot be fetched.
	 */
	private function web_html($get, $search, $out){
		
		$proxy = $this->backend->get_ip();
		
		if(!empty($get["npt"])){
			
			// follow-up page: the token resolves to the stored query string
			// and the proxy that served the previous page
			[$query, $proxy] =
				$this->backend->get(
					$get["npt"],
					"web"
				);
			
			$url = "https://old-search.marginalia.nu/search?" . $query;
			$params = [];
		}else{
			
			$url = "https://old-search.marginalia.nu/search";
			$params = [
				"query" => $search
			];
			
			// filter name => [request parameter, value sent when set to "yes"]
			$toggles = [
				"adtech" => ["adtech", "reduce"],
				"recent" => ["recent", "recent"],
				"intitle" => ["searchTitle", "title"]
			];
			
			foreach($toggles as $filter => [$name, $value]){
				
				if(($get[$filter] ?? "no") == "yes"){
					
					$params[$name] = $value;
				}
			}
		}
		
		try{
			$html =
				$this->get(
					$proxy,
					$url,
					$params
				);
		}catch(Exception $error){
			
			throw new Exception("Failed to get HTML", 0, $error);
		}
		
		$out["web"] = $this->parse_results($html);
		
		// get next page
		$out["npt"] = $this->find_next_page($html, $proxy);
		
		return $out;
	}
	
	/**
	 * Extracts the search results from a result page.
	 *
	 * @param string $html Result page markup.
	 * @return array List of result entries in the shape used for $out["web"].
	 */
	private function parse_results($html){
		
		$results = [];
		
		$this->fuckhtml->load($html);
		
		$sections =
			$this->fuckhtml
			->getElementsByClassName(
				"card search-result",
				"section"
			);
		
		foreach($sections as $section){
			
			// load the card so the lookups below run against it
			// (assumes fuckhtml searches whatever was loaded last)
			$this->fuckhtml->load($section);
			
			$titles =
				$this->fuckhtml
				->getElementsByClassName(
					"title",
					"a"
				);
			
			if(count($titles) === 0){
				
				// a card without a title link has no URL to offer
				continue;
			}
			
			$title = $titles[0];
			
			$paragraphs =
				$this->fuckhtml
				->getElementsByClassName(
					"description",
					"p"
				);
			
			$description =
				count($paragraphs) !== 0 ?
				$this->fuckhtml->getTextContent($paragraphs[0]) :
				null;
			
			$sublinks = [];
			
			$additional =
				$this->fuckhtml
				->getElementsByClassName("additional-results");
			
			if(count($additional) !== 0){
				
				// title and description were already read above, so it is
				// fine to move the parser on to the sublink container
				$this->fuckhtml->load($additional[0]);
				
				$links =
					$this->fuckhtml
					->getElementsByTagName("a");
				
				foreach($links as $link){
					
					$sublinks[] = [
						"title" =>
							$this->fuckhtml
							->getTextContent(
								$link
							),
						"date" => null,
						"description" => null,
						"url" =>
							$this->fuckhtml
							->getTextContent(
								$link["attributes"]["href"]
							)
					];
				}
			}
			
			$results[] = [
				"title" =>
					$this->fuckhtml
					->getTextContent(
						$title
					),
				"description" => $description,
				"url" =>
					$this->fuckhtml
					->getTextContent(
						$title["attributes"]["href"]
					),
				"date" => null,
				"type" => "web",
				"thumb" => [
					"url" => null,
					"ratio" => null
				],
				"sublink" => $sublinks,
				"table" => []
			];
		}
		
		return $results;
	}
	
	/**
	 * Looks for the pagination link that follows the active page and stores
	 * its query string in the backend.
	 *
	 * @param string $html  Result page markup.
	 * @param mixed  $proxy Proxy used for this page, stored alongside the query.
	 * @return mixed Value returned by backend::store(), or null when there is no following page.
	 */
	private function find_next_page($html, $proxy){
		
		$this->fuckhtml->load($html);
		
		$pagination =
			$this->fuckhtml
			->getElementsByAttributeValue(
				"aria-label",
				"pagination",
				"nav"
			);
		
		if(count($pagination) === 0){
			
			// no pagination
			return null;
		}
		
		$this->fuckhtml->load($pagination[0]);
		
		$pages =
			$this->fuckhtml
			->getElementsByClassName(
				"page-link",
				"a"
			);
		
		$found_current_page = false;
		
		foreach($pages as $page){
			
			if(
				stripos(
					$page["attributes"]["class"],
					"active"
				) !== false
			){
				
				$found_current_page = true;
				continue;
			}
			
			if($found_current_page){
				
				// we found current page index, and we iterated over
				// the next page <a>
				return
					$this->backend->store(
						parse_url(
							$page["attributes"]["href"],
							PHP_URL_QUERY
						),
						"web",
						$proxy
					);
			}
		}
		
		return null;
	}
}