4get/scraper/mojeek.php

1168 lines
24 KiB
PHP

<?php
class mojeek{
public function __construct(){
include "lib/fuckhtml.php";
$this->fuckhtml = new fuckhtml();
include "lib/backend.php";
$this->backend = new backend("mojeek");
}
public function getfilters($page){
switch($page){
case "web":
return [
"focus" => [
"display" => "Focus",
"option" => [
"any" => "No focus",
"blogs" => "Blogs",
"Dictionary" => "Dictionary",
"Recipes" => "Recipes",
"Time" => "Time",
"Weather" => "Weather"
]
],
"lang" => [
"display" => "Language",
"option" => [
"any" => "Any language",
"af" => "Afrikaans",
"sq" => "Albanian",
"an" => "Aragonese",
"ay" => "Aymara",
"bi" => "Bislama",
"br" => "Breton",
"ca" => "Catalan",
"kw" => "Cornish",
"co" => "Corsican",
"hr" => "Croatian",
"da" => "Danish",
"nl" => "Dutch",
"dz" => "Dzongkha",
"en" => "English",
"fj" => "Fijian",
"fi" => "Finnish",
"fr" => "French",
"gd" => "Gaelic",
"gl" => "Galician",
"de" => "German",
"ht" => "Haitian",
"io" => "Ido",
"id" => "Indonesian",
"ia" => "Interlingua",
"ie" => "Interlingue",
"ga" => "Irish",
"it" => "Italian",
"rw" => "Kinyarwanda",
"la" => "Latin",
"li" => "Limburgish",
"lb" => "Luxembourgish",
"no" => "Norwegian",
"nb" => "Norwegian Bokmål",
"nn" => "Norwegian Nynorsk",
"oc" => "Occitan (post 1500)",
"pl" => "Polish",
"pt" => "Portuguese",
"rm" => "Romansh",
"rn" => "Rundi",
"sg" => "Sango",
"so" => "Somali",
"es" => "Spanish",
"sw" => "Swahili",
"ss" => "Swati",
"sv" => "Swedish",
"ty" => "Tahitian",
"to" => "Tonga (Tonga Islands)",
"ts" => "Tsonga",
"vo" => "Volapük",
"wa" => "Walloon",
"cy" => "Welsh",
"xh" => "Xhosa",
"zu" => "Zulu"
]
],
"country" => [
"display" => "Country",
"option" => [
"any" => "No location bias",
"af" => "Afghanistan",
"ax" => "Åland Islands",
"al" => "Albania",
"dz" => "Algeria",
"as" => "American Samoa",
"ad" => "Andorra",
"ao" => "Angola",
"ai" => "Anguilla",
"aq" => "Antarctica",
"ag" => "Antigua and Barbuda",
"ar" => "Argentina",
"am" => "Armenia",
"aw" => "Aruba",
"au" => "Australia",
"at" => "Austria",
"az" => "Azerbaijan",
"bs" => "Bahamas",
"bh" => "Bahrain",
"bd" => "Bangladesh",
"bb" => "Barbados",
"by" => "Belarus",
"be" => "Belgium",
"bz" => "Belize",
"bj" => "Benin",
"bm" => "Bermuda",
"bt" => "Bhutan",
"bo" => "Bolivia (Plurinational State of)",
"bq" => "Bonaire, Sint Eustatius and Saba",
"ba" => "Bosnia and Herzegovina",
"bw" => "Botswana",
"bv" => "Bouvet Island",
"br" => "Brazil",
"io" => "British Indian Ocean Territory",
"bn" => "Brunei Darussalam",
"bg" => "Bulgaria",
"bf" => "Burkina Faso",
"bi" => "Burundi",
"cv" => "Cabo Verde",
"kh" => "Cambodia",
"cm" => "Cameroon",
"ca" => "Canada",
"ky" => "Cayman Islands",
"cf" => "Central African Republic",
"td" => "Chad",
"cl" => "Chile",
"cn" => "China",
"cx" => "Christmas Island",
"cc" => "Cocos (Keeling) Islands",
"co" => "Colombia",
"km" => "Comoros",
"cg" => "Congo",
"cd" => "Congo (Democratic Republic of the)",
"ck" => "Cook Islands",
"cr" => "Costa Rica",
"ci" => "Côte d'Ivoire",
"hr" => "Croatia",
"cu" => "Cuba",
"cw" => "Curaçao",
"cy" => "Cyprus",
"cz" => "Czechia",
"dk" => "Denmark",
"dj" => "Djibouti",
"dm" => "Dominica",
"do" => "Dominican Republic",
"ec" => "Ecuador",
"eg" => "Egypt",
"sv" => "El Salvador",
"gq" => "Equatorial Guinea",
"er" => "Eritrea",
"ee" => "Estonia",
"et" => "Ethiopia",
"fk" => "Falkland Islands (Malvinas)",
"fo" => "Faroe Islands",
"fj" => "Fiji",
"fi" => "Finland",
"fr" => "France",
"gf" => "French Guiana",
"pf" => "French Polynesia",
"tf" => "French Southern Territories",
"ga" => "Gabon",
"gm" => "Gambia",
"ge" => "Georgia",
"de" => "Germany",
"gh" => "Ghana",
"gi" => "Gibraltar",
"gr" => "Greece",
"gl" => "Greenland",
"gd" => "Grenada",
"gp" => "Guadeloupe",
"gu" => "Guam",
"gt" => "Guatemala",
"gg" => "Guernsey",
"gn" => "Guinea",
"gw" => "Guinea-Bissau",
"gy" => "Guyana",
"ht" => "Haiti",
"hm" => "Heard Island and McDonald Islands",
"va" => "Holy See",
"hn" => "Honduras",
"hk" => "Hong Kong",
"hu" => "Hungary",
"is" => "Iceland",
"in" => "India",
"id" => "Indonesia",
"ir" => "Iran (Islamic Republic of)",
"iq" => "Iraq",
"ie" => "Ireland",
"im" => "Isle of Man",
"il" => "Israel",
"it" => "Italy",
"jm" => "Jamaica",
"jp" => "Japan",
"je" => "Jersey",
"jo" => "Jordan",
"kz" => "Kazakhstan",
"ke" => "Kenya",
"ki" => "Kiribati",
"kp" => "Korea (Democratic People's Republic of)",
"kr" => "Korea (Republic of)",
"kw" => "Kuwait",
"kg" => "Kyrgyzstan",
"la" => "Lao People's Democratic Republic",
"lv" => "Latvia",
"lb" => "Lebanon",
"ls" => "Lesotho",
"lr" => "Liberia",
"ly" => "Libya",
"li" => "Liechtenstein",
"lt" => "Lithuania",
"lu" => "Luxembourg",
"mo" => "Macao",
"mk" => "Macedonia (the former Yugoslav Republic of)",
"mg" => "Madagascar",
"mw" => "Malawi",
"my" => "Malaysia",
"mv" => "Maldives",
"ml" => "Mali",
"mt" => "Malta",
"mh" => "Marshall Islands",
"mq" => "Martinique",
"mr" => "Mauritania",
"mu" => "Mauritius",
"yt" => "Mayotte",
"mx" => "Mexico",
"fm" => "Micronesia (Federated States of)",
"md" => "Moldova (Republic of)",
"mc" => "Monaco",
"mn" => "Mongolia",
"me" => "Montenegro",
"ms" => "Montserrat",
"ma" => "Morocco",
"mz" => "Mozambique",
"mm" => "Myanmar",
"na" => "Namibia",
"nr" => "Nauru",
"np" => "Nepal",
"nl" => "Netherlands",
"nc" => "New Caledonia",
"nz" => "New Zealand",
"ni" => "Nicaragua",
"ne" => "Niger",
"ng" => "Nigeria",
"nu" => "Niue",
"nf" => "Norfolk Island",
"mp" => "Northern Mariana Islands",
"no" => "Norway",
"om" => "Oman",
"pk" => "Pakistan",
"pw" => "Palau",
"ps" => "Palestine, State of",
"pa" => "Panama",
"pg" => "Papua New Guinea",
"py" => "Paraguay",
"pe" => "Peru",
"ph" => "Philippines",
"pn" => "Pitcairn",
"pl" => "Poland",
"pt" => "Portugal",
"pr" => "Puerto Rico",
"qa" => "Qatar",
"re" => "Réunion",
"ro" => "Romania",
"ru" => "Russian Federation",
"rw" => "Rwanda",
"bl" => "Saint Barthélemy",
"sh" => "Saint Helena, Ascension and Tristan da Cunha",
"kn" => "Saint Kitts and Nevis",
"lc" => "Saint Lucia",
"mf" => "Saint Martin (French part)",
"pm" => "Saint Pierre and Miquelon",
"vc" => "Saint Vincent and the Grenadines",
"ws" => "Samoa",
"sm" => "San Marino",
"st" => "Sao Tome and Principe",
"sa" => "Saudi Arabia",
"sn" => "Senegal",
"rs" => "Serbia",
"sc" => "Seychelles",
"sl" => "Sierra Leone",
"sg" => "Singapore",
"sx" => "Sint Maarten (Dutch part)",
"sk" => "Slovakia",
"si" => "Slovenia",
"sb" => "Solomon Islands",
"so" => "Somalia",
"za" => "South Africa",
"gs" => "South Georgia and South Sandwich Islands",
"ss" => "South Sudan",
"es" => "Spain",
"lk" => "Sri Lanka",
"sd" => "Sudan",
"sr" => "Suriname",
"sj" => "Svalbard and Jan Mayen",
"sz" => "Swaziland",
"se" => "Sweden",
"ch" => "Switzerland",
"sy" => "Syrian Arab Republic",
"tw" => "Taiwan",
"tj" => "Tajikistan",
"tz" => "Tanzania, United Republic of",
"th" => "Thailand",
"tl" => "Timor-Leste",
"tg" => "Togo",
"tk" => "Tokelau",
"to" => "Tonga",
"tt" => "Trinidad and Tobago",
"tn" => "Tunisia",
"tr" => "Turkey",
"tm" => "Turkmenistan",
"tc" => "Turks and Caicos Islands",
"tv" => "Tuvalu",
"ug" => "Uganda",
"ua" => "Ukraine",
"ae" => "United Arab Emirates",
"gb" => "United Kingdom",
"us" => "United States of America",
"um" => "United States Minor Outlying Islands",
"uy" => "Uruguay",
"uz" => "Uzbekistan",
"vu" => "Vanuatu",
"ve" => "Venezuela (Bolivarian Republic of)",
"vn" => "Viet Nam",
"vg" => "Virgin Islands (British)",
"vi" => "Virgin Islands (U.S.)",
"wf" => "Wallis and Futuna",
"eh" => "Western Sahara",
"ye" => "Yemen",
"zm" => "Zambia",
"zw" => "Zimbabwe"
]
],
"region" => [
"display" => "Region",
"option" => [
"any" => "Any region",
"eu" => "European Union",
"de" => "Germany",
"fr" => "France",
"uk" => "United Kingdom"
]
],
"domain" => [
"display" => "Results per domain",
"option" => [
"1" => "1 result",
"2" => "2 results",
"3" => "3 results",
"4" => "4 results",
"5" => "5 results",
"10" => "10 results",
"0" => "Unlimited",
]
]
];
break;
case "news":
return [];
}
}
private function get($proxy, $url, $get = []){
$headers = [
"User-Agent: " . config::USER_AGENT,
"Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
"Accept-Language: en-US,en;q=0.5",
"Accept-Encoding: gzip",
"DNT: 1",
"Connection: keep-alive",
"Upgrade-Insecure-Requests: 1",
"Sec-Fetch-Dest: document",
"Sec-Fetch-Mode: navigate",
"Sec-Fetch-Site: none",
"Sec-Fetch-User: ?1"
];
$curlproc = curl_init();
if($get !== []){
$get = http_build_query($get);
$url .= "?" . $get;
}
curl_setopt($curlproc, CURLOPT_URL, $url);
curl_setopt($curlproc, CURLOPT_ENCODING, ""); // default encoding
curl_setopt($curlproc, CURLOPT_HTTPHEADER, $headers);
curl_setopt($curlproc, CURLOPT_RETURNTRANSFER, true);
curl_setopt($curlproc, CURLOPT_SSL_VERIFYHOST, 2);
curl_setopt($curlproc, CURLOPT_SSL_VERIFYPEER, true);
curl_setopt($curlproc, CURLOPT_CONNECTTIMEOUT, 30);
curl_setopt($curlproc, CURLOPT_TIMEOUT, 30);
$this->backend->assign_proxy($curlproc, $proxy);
$data = curl_exec($curlproc);
if(curl_errno($curlproc)){
throw new Exception(curl_error($curlproc));
}
curl_close($curlproc);
return $data;
}
public function web($get){
if($get["npt"]){
[$token, $proxy] = $this->backend->get($get["npt"], "web");
try{
$html =
$this->get(
$proxy,
"https://www.mojeek.com" . $token,
[]
);
}catch(Exception $error){
throw new Exception("Failed to get HTML");
}
}else{
$search = $get["s"];
$lang = $get["lang"];
$country = $get["country"];
$region = $get["region"];
$domain = $get["domain"];
$focus = $get["focus"];
if(strlen($search) === 0){
throw new Exception("Search term is empty!");
}
$params = [
"q" => $search,
"t" => 20, // number of results/page
"tn" => 7, // number of news results/page
"date" => 1, // show date
"tlen" => 128, // max length of title
"dlen" => 511, // max length of description
"arc" => ($country == "any" ? "none" : $country) // location. don't use autodetect!
];
switch($focus){
case "any": break;
case "blogs":
$params["fmt"] = "sst";
$params["sst"] = "1";
break;
default:
$params["foc_t"] = $focus;
break;
}
if($lang != "any"){
$params["lb"] = $lang;
}
if($region != "any"){
$params["reg"] = $region;
}
if($domain != "1"){
$params["si"] = $domain;
}
$proxy = $this->backend->get_ip();
try{
$html =
$this->get(
$proxy,
"https://www.mojeek.com/search",
$params
);
}catch(Exception $error){
throw new Exception("Failed to get HTML");
}
/*
$handle = fopen("scraper/mojeek.html", "r");
$html = fread($handle, filesize("scraper/mojeek.html"));
fclose($handle);*/
}
$out = [
"status" => "ok",
"spelling" => [
"type" => "no_correction",
"using" => null,
"correction" => null
],
"npt" => null,
"answer" => [],
"web" => [],
"image" => [],
"video" => [],
"news" => [],
"related" => []
];
$this->fuckhtml->load($html);
$results =
$this->fuckhtml
->getElementsByClassName("results-standard", "ul");
if(count($results) === 0){
return $out;
}
/*
Get all search result divs
*/
foreach($results as $container){
$this->fuckhtml->load($container);
$results =
$this->fuckhtml
->getElementsByTagName("li");
foreach($results as $result){
$data = [
"title" => null,
"description" => null,
"url" => null,
"date" => null,
"type" => "web",
"thumb" => [
"url" => null,
"ratio" => null
],
"sublink" => [],
"table" => []
];
$this->fuckhtml->load($result);
$title =
$this->fuckhtml
->getElementsByClassName("title", "a")[0];
$data["title"] =
html_entity_decode(
$this->fuckhtml
->getTextContent(
$title["innerHTML"]
)
);
$data["url"] =
html_entity_decode(
$this->fuckhtml
->getTextContent(
$title["attributes"]["href"]
)
);
$description =
$this->fuckhtml
->getElementsByClassName(
"s", "p"
);
if(count($description) !== 0){
$data["description"] =
$this->titledots(
html_entity_decode(
$this->fuckhtml
->getTextContent(
$description[0]
)
)
);
}
$data["date"] =
explode(
" - ",
$this->fuckhtml
->getTextContent(
$this->fuckhtml
->getElementsByClassName("i", "p")[1]
)
);
$data["date"] =
strtotime(
$data["date"][count($data["date"]) - 1]
);
$out["web"][] = $data;
}
}
/*
Get instant answers
*/
$this->fuckhtml->load($html);
$infoboxes =
$this->fuckhtml
->getElementsByClassName(
"infobox infobox-top",
"div"
);
foreach($infoboxes as $infobox){
$answer = [
"title" => null,
"description" => [],
"url" => null,
"thumb" => null,
"table" => [],
"sublink" => []
];
// load first part with title + short definition
$infobox_html =
explode(
"<hr>",
$infobox["innerHTML"]
);
$this->fuckhtml->load($infobox_html[0]);
// title
$answer["title"] =
$this->fuckhtml
->getTextContent(
$this->fuckhtml
->getElementsByTagName("h1")[0]
);
// short definition
$definition =
$this->fuckhtml
->getElementsByTagName(
"p"
);
if(count($definition) !== 0){
$answer["description"][] = [
"type" => "quote",
"value" =>
$this->fuckhtml
->getTextContent(
$definition[0]
)
];
}
// get thumbnail, if it exists
$this->fuckhtml->load($infobox_html[1]);
$thumb =
$this->fuckhtml
->getElementsByClassName("float-right", "img");
if(count($thumb) !== 0){
preg_match(
'/\/image\?img=([^&]+)/i',
$thumb[0]["attributes"]["src"],
$thumb
);
if(count($thumb) === 2){
$answer["thumb"] =
$this->fuckhtml
->getTextContent(
$thumb[1]
);
}
}
// get description
$ps =
$this->fuckhtml
->getElementsByTagName("p");
$first_tag = true;
foreach($ps as $p){
$this->fuckhtml->load($p);
if(
preg_match(
'/^\s*<strong>/i',
$p["innerHTML"]
)
){
/*
Parse table
*/
$strong =
$this->fuckhtml
->getElementsByTagName("strong")[0];
$p["innerHTML"] =
str_replace($strong["innerHTML"], "", $p["innerHTML"]);
$strong =
preg_replace(
'/:$/',
"",
ucfirst(
$this->fuckhtml
->getTextContent(
$strong
)
)
);
$answer["table"][trim($strong)] =
trim(
$this->fuckhtml
->getTextContent(
$p
)
);
continue;
}
$as =
$this->fuckhtml
->getElementsByClassName("svg-icon");
if(count($as) !== 0){
/*
Parse websites
*/
foreach($as as $a){
$answer["sublink"][
ucfirst(explode(" ", $a["attributes"]["class"], 2)[1])
] =
$this->fuckhtml
->getTextContent(
$a["attributes"]["href"]
);
}
continue;
}
/*
Parse text content
*/
$tags =
$this->fuckhtml
->getElementsByTagName("*");
$i = 0;
foreach($tags as $tag){
$c = count($answer["description"]);
// remove tag from innerHTML
$p["innerHTML"] =
explode($tag["outerHTML"], $p["innerHTML"], 2);
if(count($p["innerHTML"]) === 2){
if(
$i === 0 &&
$c !== 0 &&
$answer["description"][$c - 1]["type"] == "link"
){
$append = "\n\n";
}else{
$append = "";
}
if($p["innerHTML"][0] != ""){
$answer["description"][] = [
"type" => "text",
"value" => $append . trim($p["innerHTML"][0])
];
}
$p["innerHTML"] = $p["innerHTML"][1];
}else{
$p["innerHTML"] = $p["innerHTML"][0];
}
switch($tag["tagName"]){
case "a":
$value =
$this->fuckhtml
->getTextContent(
$tag
);
if(strtolower($value) == "wikipedia"){
if($c !== 0){
$answer["description"][$c - 1]["value"] =
rtrim($answer["description"][$c - 1]["value"]);
}
break;
}
$answer["description"][] = [
"type" => "link",
"url" =>
$this->fuckhtml
->getTextContent(
$tag["attributes"]["href"]
),
"value" =>
$this->fuckhtml
->getTextContent(
$tag
)
];
break;
}
$i++;
}
}
// get URL
$this->fuckhtml->load($infobox_html[2]);
$answer["url"] =
$this->fuckhtml
->getTextContent(
$this->fuckhtml
->getElementsByTagName(
"a"
)[0]
["attributes"]
["href"]
);
// append answer
$out["answer"][] = $answer;
}
/*
Get news
*/
$this->fuckhtml->load($html);
$news =
$this->fuckhtml
->getElementsByClassName(
"results news-results",
"div"
);
if(count($news) !== 0){
$this->fuckhtml->load($news[0]);
$lis =
$this->fuckhtml
->getElementsByTagName("li");
foreach($lis as $li){
$this->fuckhtml->load($li);
$a =
$this->fuckhtml
->getElementsByClassName(
"ob",
"a"
);
if(count($a) === 0){
continue;
}
$a = $a[0];
$date =
explode(
" - ",
$this->fuckhtml
->getTextContent(
$this->fuckhtml
->getElementsByTagName(
"span"
)[0]
)
);
$date =
strtotime(
$date[count($date) - 1]
);
$out["news"][] = [
"title" =>
html_entity_decode(
$this->fuckhtml
->getTextContent(
$a
)
),
"description" => null,
"date" => $date,
"thumb" => [
"url" => null,
"ratio" => null
],
"url" =>
$this->fuckhtml
->getTextContent(
$a["attributes"]["href"]
)
];
}
}
/*
Get next page
*/
$this->fuckhtml->load($html);
$pagination =
$this->fuckhtml
->getElementsByClassName("pagination");
if(count($pagination) !== false){
$this->fuckhtml->load($pagination[0]);
$as =
$this->fuckhtml
->getElementsByTagName("a");
foreach($as as $a){
if($a["innerHTML"] == "Next"){
$out["npt"] = $this->backend->store(
$this->fuckhtml
->getTextContent(
$a["attributes"]["href"]
),
"web",
$proxy
);
}
}
}
return $out;
}
public function news($get){
$search = $get["s"];
if(strlen($search) === 0){
throw new Exception("Search term is empty!");
}
$out = [
"status" => "ok",
"npt" => null,
"news" => []
];
try{
$html =
$this->get(
$this->backend->get_ip(),
"https://www.mojeek.com/search",
[
"q" => $search,
"fmt" => "news"
]
);
}catch(Exception $error){
throw new Exception("Failed to get HTML");
}
/*
$handle = fopen("scraper/mojeek.html", "r");
$html = fread($handle, filesize("scraper/mojeek.html"));
fclose($handle);
*/
$this->fuckhtml->load($html);
$articles =
$this->fuckhtml->getElementsByTagName("article");
foreach($articles as $article){
$this->fuckhtml->load($article);
$data = [
"title" => null,
"author" => null,
"description" => null,
"date" => null,
"thumb" =>
[
"url" => null,
"ratio" => null
],
"url" => null
];
$a = $this->fuckhtml->getElementsByTagName("a")[0];
$data["title"] =
$this->fuckhtml
->getTextContent(
$a["attributes"]["title"]
);
$data["url"] =
$this->fuckhtml
->getTextContent(
$a["attributes"]["href"]
);
$p = $this->fuckhtml->getElementsByTagName("p");
$data["description"] =
$this->titledots(
$this->fuckhtml
->getTextContent(
$this->fuckhtml
->getElementsByClassName(
"s",
$p
)[0]
)
);
if($data["description"] == ""){
$data["description"] = null;
}
// get date from big node
$date =
$this->fuckhtml
->getElementsByClassName(
"date",
$p
);
if(count($date) !== 0){
$data["date"] =
strtotime(
$this->fuckhtml
->getTextContent(
$date[0]
)
);
}
// grep date + author
$s =
$this->fuckhtml
->getElementsByClassName(
"i",
$p
)[0];
$this->fuckhtml->load($s);
$a =
$this->fuckhtml
->getElementsByTagName("a");
if(count($a) !== 0){
// parse big node information
$data["author"] =
$this->fuckhtml
->getTextContent(
$a[0]["innerHTML"]
);
}else{
// parse smaller nodes
$replace =
$this->fuckhtml
->getElementsByTagName("time")[0];
$data["date"] =
strtotime(
$this->fuckhtml
->getTextContent(
$replace
)
);
$s["innerHTML"] =
str_replace(
$replace["outerHTML"],
"",
$s["innerHTML"]
);
$data["author"] =
preg_replace(
'/ &bull; $/',
"",
$s["innerHTML"]
);
}
$out["news"][] = $data;
}
return $out;
}
private function titledots($title){
return trim($title, ". \t\n\r\0\x0B");
}
}