1183 lines
24 KiB
PHP
1183 lines
24 KiB
PHP
|
<?php
|
||
|
|
||
|
/**
 * Scraper for the Mojeek search engine (https://www.mojeek.com).
 *
 * Fetches result pages over HTTP and converts the HTML into plain arrays
 * using the project's fuckhtml parser. Pagination tokens are kept in the
 * project's nextpage store.
 */
class mojeek{
	
	/** @var fuckhtml HTML parser helper, created in the constructor. */
	public $fuckhtml;
	
	/** @var nextpage Next-page token store, created in the constructor. */
	public $nextpage;
	
	/**
	 * Loads the helper libraries and instantiates them.
	 */
	public function __construct(){
		
		// include_once instead of include: including a class file that has
		// already been loaded would be a fatal "cannot redeclare" error.
		include_once "lib/fuckhtml.php";
		$this->fuckhtml = new fuckhtml();
		
		include_once "lib/nextpage.php";
		$this->nextpage = new nextpage("mojeek");
	}
	
	/**
	 * Returns the filters available for a result page type.
	 *
	 * Each filter maps its request key to a "display" label and an
	 * "option" table of value => label.
	 *
	 * @param string $page "web" or "news".
	 * @return array Filter definitions; empty for "news" and for any
	 *               unrecognised page.
	 */
	public function getfilters($page){
		
		switch($page){
			
			case "web":
				return [
					"focus" => [
						"display" => "Focus",
						"option" => [
							"any" => "No focus",
							"blogs" => "Blogs",
							"Dictionary" => "Dictionary",
							"Recipes" => "Recipes",
							"Time" => "Time",
							"Weather" => "Weather"
						]
					],
					"lang" => [
						"display" => "Language",
						"option" => [
							"any" => "Any language",
							"af" => "Afrikaans",
							"sq" => "Albanian",
							"an" => "Aragonese",
							"ay" => "Aymara",
							"bi" => "Bislama",
							"br" => "Breton",
							"ca" => "Catalan",
							"kw" => "Cornish",
							"co" => "Corsican",
							"hr" => "Croatian",
							"da" => "Danish",
							"nl" => "Dutch",
							"dz" => "Dzongkha",
							"en" => "English",
							"fj" => "Fijian",
							"fi" => "Finnish",
							"fr" => "French",
							"gd" => "Gaelic",
							"gl" => "Galician",
							"de" => "German",
							"ht" => "Haitian",
							"io" => "Ido",
							"id" => "Indonesian",
							"ia" => "Interlingua",
							"ie" => "Interlingue",
							"ga" => "Irish",
							"it" => "Italian",
							"rw" => "Kinyarwanda",
							"la" => "Latin",
							"li" => "Limburgish",
							"lb" => "Luxembourgish",
							"no" => "Norwegian",
							"nb" => "Norwegian Bokmål",
							"nn" => "Norwegian Nynorsk",
							"oc" => "Occitan (post 1500)",
							"pl" => "Polish",
							"pt" => "Portuguese",
							"rm" => "Romansh",
							"rn" => "Rundi",
							"sg" => "Sango",
							"so" => "Somali",
							"es" => "Spanish",
							"sw" => "Swahili",
							"ss" => "Swati",
							"sv" => "Swedish",
							"ty" => "Tahitian",
							"to" => "Tonga (Tonga Islands)",
							"ts" => "Tsonga",
							"vo" => "Volapük",
							"wa" => "Walloon",
							"cy" => "Welsh",
							"xh" => "Xhosa",
							"zu" => "Zulu"
						]
					],
					"country" => [
						"display" => "Country",
						"option" => [
							"any" => "No location bias",
							"af" => "Afghanistan",
							"ax" => "Åland Islands",
							"al" => "Albania",
							"dz" => "Algeria",
							"as" => "American Samoa",
							"ad" => "Andorra",
							"ao" => "Angola",
							"ai" => "Anguilla",
							"aq" => "Antarctica",
							"ag" => "Antigua and Barbuda",
							"ar" => "Argentina",
							"am" => "Armenia",
							"aw" => "Aruba",
							"au" => "Australia",
							"at" => "Austria",
							"az" => "Azerbaijan",
							"bs" => "Bahamas",
							"bh" => "Bahrain",
							"bd" => "Bangladesh",
							"bb" => "Barbados",
							"by" => "Belarus",
							"be" => "Belgium",
							"bz" => "Belize",
							"bj" => "Benin",
							"bm" => "Bermuda",
							"bt" => "Bhutan",
							"bo" => "Bolivia (Plurinational State of)",
							"bq" => "Bonaire, Sint Eustatius and Saba",
							"ba" => "Bosnia and Herzegovina",
							"bw" => "Botswana",
							"bv" => "Bouvet Island",
							"br" => "Brazil",
							"io" => "British Indian Ocean Territory",
							"bn" => "Brunei Darussalam",
							"bg" => "Bulgaria",
							"bf" => "Burkina Faso",
							"bi" => "Burundi",
							"cv" => "Cabo Verde",
							"kh" => "Cambodia",
							"cm" => "Cameroon",
							"ca" => "Canada",
							"ky" => "Cayman Islands",
							"cf" => "Central African Republic",
							"td" => "Chad",
							"cl" => "Chile",
							"cn" => "China",
							"cx" => "Christmas Island",
							"cc" => "Cocos (Keeling) Islands",
							"co" => "Colombia",
							"km" => "Comoros",
							"cg" => "Congo",
							"cd" => "Congo (Democratic Republic of the)",
							"ck" => "Cook Islands",
							"cr" => "Costa Rica",
							"ci" => "Côte d'Ivoire",
							"hr" => "Croatia",
							"cu" => "Cuba",
							"cw" => "Curaçao",
							"cy" => "Cyprus",
							"cz" => "Czechia",
							"dk" => "Denmark",
							"dj" => "Djibouti",
							"dm" => "Dominica",
							"do" => "Dominican Republic",
							"ec" => "Ecuador",
							"eg" => "Egypt",
							"sv" => "El Salvador",
							"gq" => "Equatorial Guinea",
							"er" => "Eritrea",
							"ee" => "Estonia",
							"et" => "Ethiopia",
							"fk" => "Falkland Islands (Malvinas)",
							"fo" => "Faroe Islands",
							"fj" => "Fiji",
							"fi" => "Finland",
							"fr" => "France",
							"gf" => "French Guiana",
							"pf" => "French Polynesia",
							"tf" => "French Southern Territories",
							"ga" => "Gabon",
							"gm" => "Gambia",
							"ge" => "Georgia",
							"de" => "Germany",
							"gh" => "Ghana",
							"gi" => "Gibraltar",
							"gr" => "Greece",
							"gl" => "Greenland",
							"gd" => "Grenada",
							"gp" => "Guadeloupe",
							"gu" => "Guam",
							"gt" => "Guatemala",
							"gg" => "Guernsey",
							"gn" => "Guinea",
							"gw" => "Guinea-Bissau",
							"gy" => "Guyana",
							"ht" => "Haiti",
							"hm" => "Heard Island and McDonald Islands",
							"va" => "Holy See",
							"hn" => "Honduras",
							"hk" => "Hong Kong",
							"hu" => "Hungary",
							"is" => "Iceland",
							"in" => "India",
							"id" => "Indonesia",
							"ir" => "Iran (Islamic Republic of)",
							"iq" => "Iraq",
							"ie" => "Ireland",
							"im" => "Isle of Man",
							"il" => "Israel",
							"it" => "Italy",
							"jm" => "Jamaica",
							"jp" => "Japan",
							"je" => "Jersey",
							"jo" => "Jordan",
							"kz" => "Kazakhstan",
							"ke" => "Kenya",
							"ki" => "Kiribati",
							"kp" => "Korea (Democratic People's Republic of)",
							"kr" => "Korea (Republic of)",
							"kw" => "Kuwait",
							"kg" => "Kyrgyzstan",
							"la" => "Lao People's Democratic Republic",
							"lv" => "Latvia",
							"lb" => "Lebanon",
							"ls" => "Lesotho",
							"lr" => "Liberia",
							"ly" => "Libya",
							"li" => "Liechtenstein",
							"lt" => "Lithuania",
							"lu" => "Luxembourg",
							"mo" => "Macao",
							"mk" => "Macedonia (the former Yugoslav Republic of)",
							"mg" => "Madagascar",
							"mw" => "Malawi",
							"my" => "Malaysia",
							"mv" => "Maldives",
							"ml" => "Mali",
							"mt" => "Malta",
							"mh" => "Marshall Islands",
							"mq" => "Martinique",
							"mr" => "Mauritania",
							"mu" => "Mauritius",
							"yt" => "Mayotte",
							"mx" => "Mexico",
							"fm" => "Micronesia (Federated States of)",
							"md" => "Moldova (Republic of)",
							"mc" => "Monaco",
							"mn" => "Mongolia",
							"me" => "Montenegro",
							"ms" => "Montserrat",
							"ma" => "Morocco",
							"mz" => "Mozambique",
							"mm" => "Myanmar",
							"na" => "Namibia",
							"nr" => "Nauru",
							"np" => "Nepal",
							"nl" => "Netherlands",
							"nc" => "New Caledonia",
							"nz" => "New Zealand",
							"ni" => "Nicaragua",
							"ne" => "Niger",
							"ng" => "Nigeria",
							"nu" => "Niue",
							"nf" => "Norfolk Island",
							"mp" => "Northern Mariana Islands",
							"no" => "Norway",
							"om" => "Oman",
							"pk" => "Pakistan",
							"pw" => "Palau",
							"ps" => "Palestine, State of",
							"pa" => "Panama",
							"pg" => "Papua New Guinea",
							"py" => "Paraguay",
							"pe" => "Peru",
							"ph" => "Philippines",
							"pn" => "Pitcairn",
							"pl" => "Poland",
							"pt" => "Portugal",
							"pr" => "Puerto Rico",
							"qa" => "Qatar",
							"re" => "Réunion",
							"ro" => "Romania",
							"ru" => "Russian Federation",
							"rw" => "Rwanda",
							"bl" => "Saint Barthélemy",
							"sh" => "Saint Helena, Ascension and Tristan da Cunha",
							"kn" => "Saint Kitts and Nevis",
							"lc" => "Saint Lucia",
							"mf" => "Saint Martin (French part)",
							"pm" => "Saint Pierre and Miquelon",
							"vc" => "Saint Vincent and the Grenadines",
							"ws" => "Samoa",
							"sm" => "San Marino",
							"st" => "Sao Tome and Principe",
							"sa" => "Saudi Arabia",
							"sn" => "Senegal",
							"rs" => "Serbia",
							"sc" => "Seychelles",
							"sl" => "Sierra Leone",
							"sg" => "Singapore",
							"sx" => "Sint Maarten (Dutch part)",
							"sk" => "Slovakia",
							"si" => "Slovenia",
							"sb" => "Solomon Islands",
							"so" => "Somalia",
							"za" => "South Africa",
							"gs" => "South Georgia and South Sandwich Islands",
							"ss" => "South Sudan",
							"es" => "Spain",
							"lk" => "Sri Lanka",
							"sd" => "Sudan",
							"sr" => "Suriname",
							"sj" => "Svalbard and Jan Mayen",
							"sz" => "Swaziland",
							"se" => "Sweden",
							"ch" => "Switzerland",
							"sy" => "Syrian Arab Republic",
							"tw" => "Taiwan",
							"tj" => "Tajikistan",
							"tz" => "Tanzania, United Republic of",
							"th" => "Thailand",
							"tl" => "Timor-Leste",
							"tg" => "Togo",
							"tk" => "Tokelau",
							"to" => "Tonga",
							"tt" => "Trinidad and Tobago",
							"tn" => "Tunisia",
							"tr" => "Turkey",
							"tm" => "Turkmenistan",
							"tc" => "Turks and Caicos Islands",
							"tv" => "Tuvalu",
							"ug" => "Uganda",
							"ua" => "Ukraine",
							"ae" => "United Arab Emirates",
							"gb" => "United Kingdom",
							"us" => "United States of America",
							"um" => "United States Minor Outlying Islands",
							"uy" => "Uruguay",
							"uz" => "Uzbekistan",
							"vu" => "Vanuatu",
							"ve" => "Venezuela (Bolivarian Republic of)",
							"vn" => "Viet Nam",
							"vg" => "Virgin Islands (British)",
							"vi" => "Virgin Islands (U.S.)",
							"wf" => "Wallis and Futuna",
							"eh" => "Western Sahara",
							"ye" => "Yemen",
							"zm" => "Zambia",
							"zw" => "Zimbabwe"
						]
					],
					"region" => [
						"display" => "Region",
						"option" => [
							"any" => "Any region",
							"eu" => "European Union",
							"de" => "Germany",
							"fr" => "France",
							"uk" => "United Kingdom"
						]
					],
					"domain" => [
						"display" => "Results per domain",
						"option" => [
							"1" => "1 result",
							"2" => "2 results",
							"3" => "3 results",
							"4" => "4 results",
							"5" => "5 results",
							"10" => "10 results",
							"0" => "Unlimited",
						]
					]
				];
			
			case "news":
				return [];
		}
		
		// unknown page: hand back an empty filter list rather than null
		return [];
	}
	
	/**
	 * Performs an HTTP GET request with browser-like headers.
	 *
	 * @param string $url Target URL without query string.
	 * @param array  $get Query parameters; when non-empty they are encoded
	 *                    with http_build_query() and appended to $url.
	 * @return string Response body as returned by curl_exec().
	 * @throws Exception With the cURL error message when the transfer fails.
	 */
	private function get($url, $get = []){
		
		$headers = [
			"User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:107.0) Gecko/20100101 Firefox/110.0",
			"Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
			"Accept-Language: en-US,en;q=0.5",
			"Accept-Encoding: gzip",
			"DNT: 1",
			"Connection: keep-alive",
			"Upgrade-Insecure-Requests: 1",
			"Sec-Fetch-Dest: document",
			"Sec-Fetch-Mode: navigate",
			"Sec-Fetch-Site: none",
			"Sec-Fetch-User: ?1"
		];
		
		if($get !== []){
			
			$url .= "?" . http_build_query($get);
		}
		
		$curlproc = curl_init();
		
		curl_setopt_array(
			$curlproc,
			[
				CURLOPT_URL => $url,
				CURLOPT_ENCODING => "", // default encoding
				CURLOPT_HTTPHEADER => $headers,
				CURLOPT_RETURNTRANSFER => true,
				CURLOPT_SSL_VERIFYHOST => 2,
				CURLOPT_SSL_VERIFYPEER => true,
				CURLOPT_CONNECTTIMEOUT => 30,
				CURLOPT_TIMEOUT => 30
			]
		);
		
		$data = curl_exec($curlproc);
		
		if(curl_errno($curlproc)){
			
			// read the message first, then release the handle before throwing
			$message = curl_error($curlproc);
			curl_close($curlproc);
			
			throw new Exception($message);
		}
		
		curl_close($curlproc);
		return $data;
	}
	
	/**
	 * Wraps get() and reports every transport failure with one message.
	 *
	 * @param string $url    Target URL.
	 * @param array  $params Query parameters passed through to get().
	 * @return string Response body.
	 * @throws Exception "Failed to get HTML"; the cURL exception is attached
	 *                   as the previous exception.
	 */
	private function fetch($url, $params = []){
		
		try{
			
			return $this->get($url, $params);
			
		}catch(Exception $error){
			
			throw new Exception("Failed to get HTML", 0, $error);
		}
	}
	
	/**
	 * Converts a date string to a unix timestamp.
	 *
	 * @param string $text Date text taken from the page.
	 * @return int|null Timestamp, or null when strtotime() cannot parse it.
	 */
	private function parsedate($text){
		
		$time = strtotime($text);
		
		return $time === false ? null : $time;
	}
	
	/**
	 * Runs a web search, or continues one from a next-page token.
	 *
	 * Request keys read from $get:
	 *  - "npt": next-page token; when truthy every other key is ignored
	 *  - "s": search term (required otherwise)
	 *  - "lang", "country", "region", "domain", "focus": filter values as
	 *    listed by getfilters("web"); missing keys fall back to
	 *    "any" / "any" / "any" / "1" / "any"
	 *
	 * @param array $get Request parameters.
	 * @return array Keys: status, spelling, npt, answer, web, image, video,
	 *               news, related. Only npt, answer, web and news are
	 *               filled in by this scraper.
	 * @throws Exception When the search term is empty or the page cannot be
	 *                   fetched.
	 */
	public function web($get){
		
		if(!empty($get["npt"])){
			
			// the stored token is appended to the site origin as-is
			$token = $this->nextpage->get($get["npt"], "web");
			
			$html = $this->fetch("https://www.mojeek.com" . $token);
			
		}else{
			
			$search = isset($get["s"]) ? $get["s"] : "";
			$lang = isset($get["lang"]) ? $get["lang"] : "any";
			$country = isset($get["country"]) ? $get["country"] : "any";
			$region = isset($get["region"]) ? $get["region"] : "any";
			$domain = isset($get["domain"]) ? $get["domain"] : "1";
			$focus = isset($get["focus"]) ? $get["focus"] : "any";
			
			if(strlen($search) === 0){
				
				throw new Exception("Search term is empty!");
			}
			
			$params = [
				"q" => $search,
				"t" => 20, // number of results/page
				"tn" => 7, // number of news results/page
				"date" => 1, // show date
				"tlen" => 128, // max length of title
				"dlen" => 511, // max length of description
				"arc" => ($country == "any" ? "none" : $country) // location. don't use autodetect!
			];
			
			switch($focus){
				
				case "any": break;
				
				case "blogs":
					$params["fmt"] = "sst";
					$params["sst"] = "1";
					break;
				
				default:
					$params["foc_t"] = $focus;
					break;
			}
			
			if($lang != "any"){
				
				$params["lb"] = $lang;
			}
			
			if($region != "any"){
				
				$params["reg"] = $region;
			}
			
			if($domain != "1"){
				
				$params["si"] = $domain;
			}
			
			$html = $this->fetch("https://www.mojeek.com/search", $params);
		}
		
		$out = [
			"status" => "ok",
			"spelling" => [
				"type" => "no_correction",
				"using" => null,
				"correction" => null
			],
			"npt" => null,
			"answer" => [],
			"web" => [],
			"image" => [],
			"video" => [],
			"news" => [],
			"related" => []
		];
		
		$this->fuckhtml->load($html);
		
		$results =
			$this->fuckhtml
			->getElementsByClassName("results-standard", "ul");
		
		if(count($results) === 0){
			
			// no result list on the page: nothing else is parsed
			return $out;
		}
		
		$this->fuckhtml->load($results[0]);
		
		/*
			Get search results
		*/
		$results =
			$this->fuckhtml
			->getElementsByTagName("li");
		
		foreach($results as $result){
			
			$data = [
				"title" => null,
				"description" => null,
				"url" => null,
				"date" => null,
				"type" => "web",
				"thumb" => [
					"url" => null,
					"ratio" => null
				],
				"sublink" => [],
				"table" => []
			];
			
			$this->fuckhtml->load($result);
			
			$title =
				$this->fuckhtml
				->getElementsByClassName("title", "a");
			
			if(count($title) === 0){
				
				// list item without a title link: not a result entry
				continue;
			}
			
			$title = $title[0];
			
			$data["title"] =
				html_entity_decode(
					$this->fuckhtml
					->getTextContent(
						$title["innerHTML"]
					)
				);
			
			$data["url"] =
				html_entity_decode(
					$this->fuckhtml
					->getTextContent(
						$title["attributes"]["href"]
					)
				);
			
			$description =
				$this->fuckhtml
				->getElementsByClassName("s", "p");
			
			if(count($description) !== 0){
				
				$data["description"] =
					$this->titledots(
						html_entity_decode(
							$this->fuckhtml
							->getTextContent(
								$description[0]
							)
						)
					);
			}
			
			// the date is the last " - " separated piece of the second
			// <p class="i"> element, when that element exists
			$meta =
				$this->fuckhtml
				->getElementsByClassName("i", "p");
			
			if(isset($meta[1])){
				
				$pieces =
					explode(
						" - ",
						$this->fuckhtml
						->getTextContent($meta[1])
					);
				
				$data["date"] =
					$this->parsedate($pieces[count($pieces) - 1]);
			}
			
			$out["web"][] = $data;
		}
		
		/*
			Get instant answers
		*/
		$this->fuckhtml->load($html);
		
		$infoboxes =
			$this->fuckhtml
			->getElementsByClassName(
				"infobox infobox-top",
				"div"
			);
		
		foreach($infoboxes as $infobox){
			
			$answer = [
				"title" => null,
				"description" => [],
				"url" => null,
				"thumb" => null,
				"table" => [],
				"sublink" => []
			];
			
			// the infobox is split on <hr>: part 0 is read for the title and
			// short definition, part 1 for thumbnail + description, part 2
			// for the source link. Parts 1 and 2 may be absent.
			$infobox_html =
				explode(
					"<hr>",
					$infobox["innerHTML"]
				);
			
			$this->fuckhtml->load($infobox_html[0]);
			
			// title
			$heading =
				$this->fuckhtml
				->getElementsByTagName("h1");
			
			if(count($heading) !== 0){
				
				$answer["title"] =
					$this->fuckhtml
					->getTextContent($heading[0]);
			}
			
			// short definition
			$definition =
				$this->fuckhtml
				->getElementsByTagName("p");
			
			if(count($definition) !== 0){
				
				$answer["description"][] = [
					"type" => "quote",
					"value" =>
						$this->fuckhtml
						->getTextContent(
							$definition[0]
						)
				];
			}
			
			if(isset($infobox_html[1])){
				
				$this->fuckhtml->load($infobox_html[1]);
				
				// get thumbnail, if it exists
				$thumb =
					$this->fuckhtml
					->getElementsByClassName("float-right", "img");
				
				if(
					count($thumb) !== 0 &&
					isset($thumb[0]["attributes"]["src"]) &&
					preg_match(
						'/\/image\?img=([^&]+)/i',
						$thumb[0]["attributes"]["src"],
						$match
					) === 1
				){
					
					// percent-decode the proxied image address, the same
					// way news() does for its thumbnails
					$answer["thumb"] =
						urldecode(
							$this->fuckhtml
							->getTextContent(
								$match[1]
							)
						);
				}
				
				// get description
				$ps =
					$this->fuckhtml
					->getElementsByTagName("p");
				
				foreach($ps as $p){
					
					$this->fuckhtml->load($p);
					
					if(
						preg_match(
							'/^\s*<strong>/i',
							$p["innerHTML"]
						)
					){
						
						/*
							Parse table: "<strong>Label:</strong> value"
						*/
						$strong =
							$this->fuckhtml
							->getElementsByTagName("strong")[0];
						
						// blank out the label so only the value text is left
						$p["innerHTML"] =
							str_replace($strong["innerHTML"], "", $p["innerHTML"]);
						
						$label =
							preg_replace(
								'/:$/',
								"",
								ucfirst(
									$this->fuckhtml
									->getTextContent(
										$strong
									)
								)
							);
						
						$answer["table"][trim($label)] =
							trim(
								$this->fuckhtml
								->getTextContent(
									$p
								)
							);
						
						continue;
					}
					
					$as =
						$this->fuckhtml
						->getElementsByClassName("svg-icon");
					
					if(count($as) !== 0){
						
						/*
							Parse websites
						*/
						foreach($as as $a){
							
							// the sublink name is whatever follows the first
							// class token; empty when there is no second token
							$classes = explode(" ", $a["attributes"]["class"], 2);
							$name = isset($classes[1]) ? $classes[1] : "";
							
							$answer["sublink"][ucfirst($name)] =
								$this->fuckhtml
								->getTextContent(
									$a["attributes"]["href"]
								);
						}
						
						continue;
					}
					
					/*
						Parse text content
					*/
					$tags =
						$this->fuckhtml
						->getElementsByTagName("*");
					
					$i = 0;
					foreach($tags as $tag){
						
						// size of the description BEFORE this iteration
						// appends anything
						$c = count($answer["description"]);
						
						// cut the paragraph at this tag: text before it
						// becomes a text node, the rest is kept for the
						// next iteration
						$p["innerHTML"] =
							explode($tag["outerHTML"], $p["innerHTML"], 2);
						
						if(count($p["innerHTML"]) === 2){
							
							if(
								$i === 0 &&
								$c !== 0 &&
								$answer["description"][$c - 1]["type"] == "link"
							){
								
								// new paragraph following a link
								$append = "\n\n";
							}else{
								
								$append = "";
							}
							
							if($p["innerHTML"][0] != ""){
								
								$answer["description"][] = [
									"type" => "text",
									"value" => $append . trim($p["innerHTML"][0])
								];
							}
							
							$p["innerHTML"] = $p["innerHTML"][1];
						}else{
							
							$p["innerHTML"] = $p["innerHTML"][0];
						}
						
						if($tag["tagName"] == "a"){
							
							$value =
								$this->fuckhtml
								->getTextContent(
									$tag
								);
							
							if(strtolower($value) == "wikipedia"){
								
								// a link whose text is "wikipedia" is not
								// emitted; only the entry that was last
								// before this iteration is right-trimmed
								if($c !== 0){
									
									$answer["description"][$c - 1]["value"] =
										rtrim($answer["description"][$c - 1]["value"]);
								}
							}else{
								
								$answer["description"][] = [
									"type" => "link",
									"url" =>
										$this->fuckhtml
										->getTextContent(
											$tag["attributes"]["href"]
										),
									"value" => $value
								];
							}
						}
						
						$i++;
					}
				}
			}
			
			// get URL
			if(isset($infobox_html[2])){
				
				$this->fuckhtml->load($infobox_html[2]);
				
				$links =
					$this->fuckhtml
					->getElementsByTagName("a");
				
				if(isset($links[0]["attributes"]["href"])){
					
					$answer["url"] =
						$this->fuckhtml
						->getTextContent(
							$links[0]["attributes"]["href"]
						);
				}
			}
			
			// append answer
			$out["answer"][] = $answer;
		}
		
		/*
			Get news
		*/
		$this->fuckhtml->load($html);
		
		$news =
			$this->fuckhtml
			->getElementsByClassName(
				"results news-results",
				"div"
			);
		
		if(count($news) !== 0){
			
			$this->fuckhtml->load($news[0]);
			
			$lis =
				$this->fuckhtml
				->getElementsByTagName("li");
			
			foreach($lis as $li){
				
				$this->fuckhtml->load($li);
				
				$a =
					$this->fuckhtml
					->getElementsByClassName(
						"ob",
						"a"
					);
				
				if(count($a) === 0){
					
					continue;
				}
				
				$a = $a[0];
				
				$title =
					html_entity_decode(
						$this->fuckhtml
						->getTextContent(
							$a
						)
					);
				
				// the date is the part after the first " - " of the first
				// <span>; stays null when either is missing
				$date = null;
				
				$spans =
					$this->fuckhtml
					->getElementsByTagName("span");
				
				if(count($spans) !== 0){
					
					$pieces =
						explode(
							" - ",
							$this->fuckhtml
							->getTextContent($spans[0]),
							2
						);
					
					if(isset($pieces[1])){
						
						$date = $this->parsedate($pieces[1]);
					}
				}
				
				$out["news"][] = [
					"title" => $title,
					"description" => null,
					"date" => $date,
					"thumb" => [
						"url" => null,
						"ratio" => null
					],
					"url" =>
						$this->fuckhtml
						->getTextContent(
							$a["attributes"]["href"]
						)
				];
			}
		}
		
		/*
			Get next page
		*/
		$this->fuckhtml->load($html);
		
		$pagination =
			$this->fuckhtml
			->getElementsByClassName("pagination");
		
		// count() never returns false; compare against 0 so a page without
		// pagination does not index into an empty list
		if(count($pagination) !== 0){
			
			$this->fuckhtml->load($pagination[0]);
			
			$as =
				$this->fuckhtml
				->getElementsByTagName("a");
			
			foreach($as as $a){
				
				if($a["innerHTML"] == "Next"){
					
					$out["npt"] = $this->nextpage->store(
						$this->fuckhtml
						->getTextContent(
							$a["attributes"]["href"]
						),
						"web"
					);
				}
			}
		}
		
		return $out;
	}
	
	/**
	 * Runs a news search.
	 *
	 * @param array $get Request parameters; only "s" (search term) is read.
	 * @return array Keys: status, npt (always null here) and news, a list of
	 *               entries with title, author, description, date, thumb
	 *               and url.
	 * @throws Exception When the search term is empty or the page cannot be
	 *                   fetched.
	 */
	public function news($get){
		
		$search = isset($get["s"]) ? $get["s"] : "";
		
		if(strlen($search) === 0){
			
			throw new Exception("Search term is empty!");
		}
		
		$out = [
			"status" => "ok",
			"npt" => null,
			"news" => []
		];
		
		$html =
			$this->fetch(
				"https://www.mojeek.com/search",
				[
					"q" => $search,
					"fmt" => "news"
				]
			);
		
		/*
			Get big, standard and smaller nodes
		*/
		foreach(
			[
				"results-extended",
				"results-standard"
			]
			as $categoryname
		){
			
			$this->fuckhtml->load($html);
			
			$categories =
				$this->fuckhtml
				->getElementsByClassName(
					$categoryname,
					"ul"
				);
			
			foreach($categories as $category){
				
				$this->fuckhtml->load($category);
				
				$nodes =
					$this->fuckhtml
					->getElementsByTagName("li");
				
				foreach($nodes as $node){
					
					$data = [
						"title" => null,
						"author" => null,
						"description" => null,
						"date" => null,
						"thumb" =>
							[
								"url" => null,
								"ratio" => null
							],
						"url" => null
					];
					
					/*
						Parse the results
					*/
					$this->fuckhtml->load($node);
					
					// get title + url
					$links =
						$this->fuckhtml
						->getElementsByTagName("a");
					
					if(count($links) === 0){
						
						// list item without any link: nothing to report
						continue;
					}
					
					$a = $links[0];
					
					$data["title"] =
						$this->fuckhtml
						->getTextContent(
							$a["attributes"]["title"]
						);
					
					$data["url"] =
						$this->fuckhtml
						->getTextContent(
							$a["attributes"]["href"]
						);
					
					// get image
					$image =
						$this->fuckhtml
						->getElementsByTagName("img");
					
					if(count($image) !== 0){
						
						$data["thumb"] = [
							"url" =>
								urldecode(
									str_replace(
										"/image?img=",
										"",
										$this->fuckhtml
										->getTextContent(
											$image[0]["attributes"]["src"]
										)
									)
								),
							"ratio" => "16:9"
						];
					}
					
					// get description
					$description =
						$this->fuckhtml
						->getElementsByClassName("s", "p");
					
					if(count($description) !== 0){
						
						$data["description"] =
							$this->titledots(
								$this->fuckhtml
								->getTextContent(
									$description[0]
								)
							);
					}
					
					// get date + time
					$date =
						$this->fuckhtml
						->getElementsByClassName(
							"date",
							"p"
						);
					
					$i =
						$this->fuckhtml
						->getElementsByClassName("i", "p");
					
					if(count($date) !== 0){
						
						// we're inside a big node
						$data["date"] = $this->parsedate($date[0]["innerHTML"]);
						
						if(count($i) !== 0){
							
							$this->fuckhtml->load($i[0]);
							
							$authors =
								$this->fuckhtml
								->getElementsByTagName("a");
							
							if(count($authors) !== 0){
								
								$data["author"] =
									$this->fuckhtml
									->getTextContent($authors[0]);
							}
						}
					}else{
						
						// we're inside a small node: "author - date", where
						// the author part may itself contain " - "
						if(count($i) !== 0){
							
							$pieces =
								explode(
									" - ",
									$this->fuckhtml
									->getTextContent($i[0])
								);
							
							$data["date"] = $this->parsedate(array_pop($pieces));
							$data["author"] = implode(" - ", $pieces);
						}
					}
					
					$out["news"][] = $data;
				}
			}
		}
		
		return $out;
	}
	
	/**
	 * Strips dots and whitespace from both ends of a string.
	 *
	 * @param string $title Text to clean.
	 * @return string Text without leading/trailing ".", space, tab, newline,
	 *                carriage return, NUL or vertical tab.
	 */
	private function titledots($title){
		
		return trim($title, ". \t\n\r\0\x0B");
	}
}
|
||
|
|