fuckhtml = new fuckhtml();
		
		include "lib/backend.php";
		$this->backend = new backend("mojeek");
	}
	
	/**
	 * Return the filter definitions available for a page type.
	 *
	 * "web" yields the focus, lang, country, region and domain filters,
	 * "news" yields an empty array. Any other value matches no case and
	 * the method returns null.
	 *
	 * @param string $page Page type ("web" or "news")
	 * @return array|null Map of filter name => ["display" => ..., "option" => [...]]
	 */
	public function getfilters($page){
		
		switch($page){
			
			case "web":
				return [
					"focus" => [
						"display" => "Focus",
						"option" => [
							"any" => "No focus",
							"blogs" => "Blogs",
							"Dictionary" => "Dictionary",
							"Recipes" => "Recipes",
							"Time" => "Time",
							"Weather" => "Weather"
						]
					],
					"lang" => [
						"display" => "Language",
						"option" => [
							"any" => "Any language",
							"af" => "Afrikaans",
							"sq" => "Albanian",
							"an" => "Aragonese",
							"ay" => "Aymara",
							"bi" => "Bislama",
							"br" => "Breton",
							"ca" => "Catalan",
							"kw" => "Cornish",
							"co" => "Corsican",
							"hr" => "Croatian",
							"da" => "Danish",
							"nl" => "Dutch",
							"dz" => "Dzongkha",
							"en" => "English",
							"fj" => "Fijian",
							"fi" => "Finnish",
							"fr" => "French",
							"gd" => "Gaelic",
							"gl" => "Galician",
							"de" => "German",
							"ht" => "Haitian",
							"io" => "Ido",
							"id" => "Indonesian",
							"ia" => "Interlingua",
							"ie" => "Interlingue",
							"ga" => "Irish",
							"it" => "Italian",
							"rw" => "Kinyarwanda",
							"la" => "Latin",
							"li" => "Limburgish",
							"lb" => "Luxembourgish",
							"no" => "Norwegian",
							"nb" => "Norwegian Bokmål",
							"nn" => "Norwegian Nynorsk",
							"oc" => "Occitan (post 1500)",
							"pl" => "Polish",
							"pt" => "Portuguese",
							"rm" => "Romansh",
							"rn" => "Rundi",
							"sg" => "Sango",
							"so" => "Somali",
							"es" => "Spanish",
							"sw" => "Swahili",
							"ss" => "Swati",
							"sv" => "Swedish",
							"ty" => "Tahitian",
							"to" => "Tonga (Tonga Islands)",
							"ts" => "Tsonga",
							"vo" => "Volapük",
							"wa" => "Walloon",
							"cy" => "Welsh",
							"xh" => "Xhosa",
							"zu" => "Zulu"
						]
					],
					"country" => [
						"display" => "Country",
						"option" => [
							"any" => "No location bias",
							"af" => "Afghanistan",
							"ax" => "Åland Islands",
							"al" => "Albania",
							"dz" => "Algeria",
							"as" => "American Samoa",
							"ad" => "Andorra",
							"ao" => "Angola",
							"ai" => "Anguilla",
							"aq" => "Antarctica",
							"ag" => "Antigua and Barbuda",
							"ar" => "Argentina",
							"am" => "Armenia",
							"aw" => "Aruba",
							"au" => "Australia",
							"at" => "Austria",
							"az" => "Azerbaijan",
							"bs" => "Bahamas",
							"bh" => "Bahrain",
							"bd" => "Bangladesh",
							"bb" => "Barbados",
							"by" => "Belarus",
							"be" => "Belgium",
							"bz" => "Belize",
							"bj" => "Benin",
							"bm" => "Bermuda",
							"bt" => "Bhutan",
							"bo" => "Bolivia (Plurinational State of)",
							"bq" => "Bonaire, Sint Eustatius and Saba",
							"ba" => "Bosnia and Herzegovina",
							"bw" => "Botswana",
							"bv" => "Bouvet Island",
							"br" => "Brazil",
							"io" => "British Indian Ocean Territory",
							"bn" => "Brunei Darussalam",
							"bg" => "Bulgaria",
							"bf" => "Burkina Faso",
							"bi" => "Burundi",
							"cv" => "Cabo Verde",
							"kh" => "Cambodia",
							"cm" => "Cameroon",
							"ca" => "Canada",
							"ky" => "Cayman Islands",
							"cf" => "Central African Republic",
							"td" => "Chad",
							"cl" => "Chile",
							"cn" => "China",
							"cx" => "Christmas Island",
							"cc" => "Cocos (Keeling) Islands",
							"co" => "Colombia",
							"km" => "Comoros",
							"cg" => "Congo",
							"cd" => "Congo (Democratic Republic of the)",
							"ck" => "Cook Islands",
							"cr" => "Costa Rica",
							"ci" => "Côte d'Ivoire",
							"hr" => "Croatia",
							"cu" => "Cuba",
							"cw" => "Curaçao",
							"cy" => "Cyprus",
							"cz" => "Czechia",
							"dk" => "Denmark",
							"dj" => "Djibouti",
							"dm" => "Dominica",
							"do" => "Dominican Republic",
							"ec" => "Ecuador",
							"eg" => "Egypt",
							"sv" => "El Salvador",
							"gq" => "Equatorial Guinea",
							"er" => "Eritrea",
							"ee" => "Estonia",
							"et" => "Ethiopia",
							"fk" => "Falkland Islands (Malvinas)",
							"fo" => "Faroe Islands",
							"fj" => "Fiji",
							"fi" => "Finland",
							"fr" => "France",
							"gf" => "French Guiana",
							"pf" => "French Polynesia",
							"tf" => "French Southern Territories",
							"ga" => "Gabon",
							"gm" => "Gambia",
							"ge" => "Georgia",
							"de" => "Germany",
							"gh" => "Ghana",
							"gi" => "Gibraltar",
							"gr" => "Greece",
							"gl" => "Greenland",
							"gd" => "Grenada",
							"gp" => "Guadeloupe",
							"gu" => "Guam",
							"gt" => "Guatemala",
							"gg" => "Guernsey",
							"gn" => "Guinea",
							"gw" => "Guinea-Bissau",
							"gy" => "Guyana",
							"ht" => "Haiti",
							"hm" => "Heard Island and McDonald Islands",
							"va" => "Holy See",
							"hn" => "Honduras",
							"hk" => "Hong Kong",
							"hu" => "Hungary",
							"is" => "Iceland",
							"in" => "India",
							"id" => "Indonesia",
							"ir" => "Iran (Islamic Republic of)",
							"iq" => "Iraq",
							"ie" => "Ireland",
							"im" => "Isle of Man",
							"il" => "Israel",
							"it" => "Italy",
							"jm" => "Jamaica",
							"jp" => "Japan",
							"je" => "Jersey",
							"jo" => "Jordan",
							"kz" => "Kazakhstan",
							"ke" => "Kenya",
							"ki" => "Kiribati",
							"kp" => "Korea (Democratic People's Republic of)",
							"kr" => "Korea (Republic of)",
							"kw" => "Kuwait",
							"kg" => "Kyrgyzstan",
							"la" => "Lao People's Democratic Republic",
							"lv" => "Latvia",
							"lb" => "Lebanon",
							"ls" => "Lesotho",
							"lr" => "Liberia",
							"ly" => "Libya",
							"li" => "Liechtenstein",
							"lt" => "Lithuania",
							"lu" => "Luxembourg",
							"mo" => "Macao",
							"mk" => "Macedonia (the former Yugoslav Republic of)",
							"mg" => "Madagascar",
							"mw" => "Malawi",
							"my" => "Malaysia",
							"mv" => "Maldives",
							"ml" => "Mali",
							"mt" => "Malta",
							"mh" => "Marshall Islands",
							"mq" => "Martinique",
							"mr" => "Mauritania",
							"mu" => "Mauritius",
							"yt" => "Mayotte",
							"mx" => "Mexico",
							"fm" => "Micronesia (Federated States of)",
							"md" => "Moldova (Republic of)",
							"mc" => "Monaco",
							"mn" => "Mongolia",
							"me" => "Montenegro",
							"ms" => "Montserrat",
							"ma" => "Morocco",
							"mz" => "Mozambique",
							"mm" => "Myanmar",
							"na" => "Namibia",
							"nr" => "Nauru",
							"np" => "Nepal",
							"nl" => "Netherlands",
							"nc" => "New Caledonia",
							"nz" => "New Zealand",
							"ni" => "Nicaragua",
							"ne" => "Niger",
							"ng" => "Nigeria",
							"nu" => "Niue",
							"nf" => "Norfolk Island",
							"mp" => "Northern Mariana Islands",
							"no" => "Norway",
							"om" => "Oman",
							"pk" => "Pakistan",
							"pw" => "Palau",
							"ps" => "Palestine, State of",
							"pa" => "Panama",
							"pg" => "Papua New Guinea",
							"py" => "Paraguay",
							"pe" => "Peru",
							"ph" => "Philippines",
							"pn" => "Pitcairn",
							"pl" => "Poland",
							"pt" => "Portugal",
							"pr" => "Puerto Rico",
							"qa" => "Qatar",
							"re" => "Réunion",
							"ro" => "Romania",
							"ru" => "Russian Federation",
							"rw" => "Rwanda",
							"bl" => "Saint Barthélemy",
							"sh" => "Saint Helena, Ascension and Tristan da Cunha",
							"kn" => "Saint Kitts and Nevis",
							"lc" => "Saint Lucia",
							"mf" => "Saint Martin (French part)",
							"pm" => "Saint Pierre and Miquelon",
							"vc" => "Saint Vincent and the Grenadines",
							"ws" => "Samoa",
							"sm" => "San Marino",
							// label previously carried a stray line break
							"st" => "Sao Tome and Principe",
							"sa" => "Saudi Arabia",
							"sn" => "Senegal",
							"rs" => "Serbia",
							"sc" => "Seychelles",
							"sl" => "Sierra Leone",
							"sg" => "Singapore",
							"sx" => "Sint Maarten (Dutch part)",
							"sk" => "Slovakia",
							"si" => "Slovenia",
							"sb" => "Solomon Islands",
							"so" => "Somalia",
							"za" => "South Africa",
							"gs" => "South Georgia and South Sandwich Islands",
							"ss" => "South Sudan",
							"es" => "Spain",
							"lk" => "Sri Lanka",
							"sd" => "Sudan",
							"sr" => "Suriname",
							"sj" => "Svalbard and Jan Mayen",
							"sz" => "Swaziland",
							"se" => "Sweden",
							"ch" => "Switzerland",
							"sy" => "Syrian Arab Republic",
							"tw" => "Taiwan",
							"tj" => "Tajikistan",
							"tz" => "Tanzania, United Republic of",
							"th" => "Thailand",
							"tl" => "Timor-Leste",
							"tg" => "Togo",
							"tk" => "Tokelau",
							"to" => "Tonga",
							"tt" => "Trinidad and Tobago",
							"tn" => "Tunisia",
							"tr" => "Turkey",
							"tm" => "Turkmenistan",
							"tc" => "Turks and Caicos Islands",
							"tv" => "Tuvalu",
							"ug" => "Uganda",
							"ua" => "Ukraine",
							"ae" => "United Arab Emirates",
							"gb" => "United Kingdom",
							"us" => "United States of America",
							"um" => "United States Minor Outlying Islands",
							"uy" => "Uruguay",
							"uz" => "Uzbekistan",
							"vu" => "Vanuatu",
							"ve" => "Venezuela (Bolivarian Republic of)",
							"vn" => "Viet Nam",
							"vg" => "Virgin Islands (British)",
							"vi" => "Virgin Islands (U.S.)",
							"wf" => "Wallis and Futuna",
							"eh" => "Western Sahara",
							"ye" => "Yemen",
							"zm" => "Zambia",
							"zw" => "Zimbabwe"
						]
					],
					"region" => [
						"display" => "Region",
						"option" => [
							"any" => "Any region",
							"eu" => "European Union",
							"de" => "Germany",
							"fr" => "France",
							"uk" => "United Kingdom"
						]
					],
					"domain" => [
						"display" => "Results per domain",
						"option" => [
							"1" => "1 result",
							"2" => "2 results",
							"3" => "3 results",
							"4" => "4 results",
							"5" => "5 results",
							"10" => "10 results",
							"0" => "Unlimited",
						]
					]
				];
			
			case "news":
				return [];
		}
	}
	
	/**
	 * Fetch a URL with cURL using browser-like request headers.
	 *
	 * The proxy is applied to the handle through backend::assign_proxy().
	 *
	 * @param mixed  $proxy Proxy descriptor handed to the backend
	 * @param string $url   Target URL without query string
	 * @param array  $get   Query parameters; appended to $url when non-empty
	 * @return string|bool  Value returned by curl_exec()
	 * @throws Exception    With the cURL error message when the transfer fails
	 */
	private function get($proxy, $url, $get = []){
		
		$headers = [
			"User-Agent: " . config::USER_AGENT,
			"Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
			"Accept-Language: en-US,en;q=0.5",
			"Accept-Encoding: gzip",
			"DNT: 1",
			"Connection: keep-alive",
			"Upgrade-Insecure-Requests: 1",
			"Sec-Fetch-Dest: document",
			"Sec-Fetch-Mode: navigate",
			"Sec-Fetch-Site: none",
			"Sec-Fetch-User: ?1"
		];
		
		$curlproc = curl_init();
		
		if($get !== []){
			
			$get = http_build_query($get);
			$url .= "?" . $get;
		}
		
		curl_setopt($curlproc, CURLOPT_URL, $url);
		
		// default encoding
		curl_setopt($curlproc, CURLOPT_ENCODING, "");
		curl_setopt($curlproc, CURLOPT_HTTPHEADER, $headers);
		
		curl_setopt($curlproc, CURLOPT_RETURNTRANSFER, true);
		curl_setopt($curlproc, CURLOPT_SSL_VERIFYHOST, 2);
		curl_setopt($curlproc, CURLOPT_SSL_VERIFYPEER, true);
		curl_setopt($curlproc, CURLOPT_CONNECTTIMEOUT, 30);
		curl_setopt($curlproc, CURLOPT_TIMEOUT, 30);
		
		$this->backend->assign_proxy($curlproc, $proxy);
		
		$data = curl_exec($curlproc);
		
		if(curl_errno($curlproc)){
			
			// read the message first, then release the handle before throwing
			$message = curl_error($curlproc);
			curl_close($curlproc);
			
			throw new Exception($message);
		}
		
		curl_close($curlproc);
		return $data;
	}
	
	/**
	 * Run a web search (or continue one from a next-page token) and
	 * scrape the returned HTML.
	 *
	 * Reads $get["npt"] first; when it is truthy the stored token and proxy
	 * are fetched from the backend and the token is requested as a path on
	 * www.mojeek.com. Otherwise $get["s"], "lang", "country", "region",
	 * "domain" and "focus" are turned into query parameters for /search.
	 *
	 * @param array $get Request parameters as described above
	 * @return array Result set with the keys status, spelling, npt, answer,
	 *               web, image, video, news and related
	 * @throws Exception When the search term is empty or the page cannot be fetched
	 */
	public function web($get){
		
		if($get["npt"]){
			
			[$token, $proxy] = $this->backend->get($get["npt"], "web");
			
			try{
				$html = $this->get($proxy, "https://www.mojeek.com" . $token, []);
			}catch(Exception $error){
				
				throw new Exception("Failed to get HTML");
			}
			
		}else{
			
			$search = $get["s"];
			if(strlen($search) === 0){
				
				throw new Exception("Search term is empty!");
			}
			
			$proxy = $this->backend->get_ip();
			$lang = $get["lang"];
			$country = $get["country"];
			$region = $get["region"];
			$domain = $get["domain"];
			$focus = $get["focus"];
			
			$params = [
				"q" => $search,
				"t" => 20, // number of results/page
				"tn" => 7, // number of news results/page
				"date" => 1, // show date
				"tlen" => 128, // max length of title
				"dlen" => 511, // max length of description
				"arc" => ($country == "any" ? "none" : $country) // location. don't use autodetect!
			];
			
			switch($focus){
				
				case "any":
					break;
				
				case "blogs":
					$params["fmt"] = "sst";
					$params["sst"] = "1";
					break;
				
				default:
					$params["foc_t"] = $focus;
					break;
			}
			
			if($lang != "any"){
				
				$params["lb"] = $lang;
			}
			
			if($region != "any"){
				
				$params["reg"] = $region;
			}
			
			if($domain != "1"){
				
				$params["si"] = $domain;
			}
			
			try{
				$html = $this->get($proxy, "https://www.mojeek.com/search", $params);
			}catch(Exception $error){
				
				throw new Exception("Failed to get HTML");
			}
		}
		
		$out = [
			"status" => "ok",
			"spelling" => [
				"type" => "no_correction",
				"using" => null,
				"correction" => null
			],
			"npt" => null,
			"answer" => [],
			"web" => [],
			"image" => [],
			"video" => [],
			"news" => [],
			"related" => []
		];
		
		$this->fuckhtml->load($html);
		
		$containers =
			$this->fuckhtml
			->getElementsByClassName("results-standard", "ul");
		
		// without a standard result list nothing else is scraped
		if(count($containers) === 0){
			
			return $out;
		}
		
		/*
			Get all search result divs
		*/
		foreach($containers as $container){
			
			$this->fuckhtml->load($container);
			$items = $this->fuckhtml->getElementsByTagName("li");
			
			foreach($items as $item){
				
				$data = [
					"title" => null,
					"description" => null,
					"url" => null,
					"date" => null,
					"type" => "web",
					"thumb" => [
						"url" => null,
						"ratio" => null
					],
					"sublink" => [],
					"table" => []
				];
				
				$this->fuckhtml->load($item);
				
				$titles =
					$this->fuckhtml
					->getElementsByClassName("title", "a");
				
				if(count($titles) === 0){
					
					// list item without a title link: not a result entry
					continue;
				}
				
				$title = $titles[0];
				
				$data["title"] =
					html_entity_decode(
						$this->fuckhtml->getTextContent($title["innerHTML"])
					);
				
				$data["url"] =
					html_entity_decode(
						$this->fuckhtml->getTextContent($title["attributes"]["href"])
					);
				
				$description =
					$this->fuckhtml
					->getElementsByClassName("s", "p");
				
				if(count($description) !== 0){
					
					$data["description"] =
						$this->titledots(
							html_entity_decode(
								$this->fuckhtml->getTextContent($description[0])
							)
						);
				}
				
				$date =
					$this->fuckhtml
					->getElementsByClassName("mdate", "span");
				
				if(count($date) !== 0){
					
					$data["date"] =
						strtotime(
							$this->fuckhtml->getTextContent($date[0])
						);
				}
				
				$out["web"][] = $data;
			}
		}
		
		/*
			Get instant answers
		*/
		$this->fuckhtml->load($html);
		
		$infoboxes =
			$this->fuckhtml
			->getElementsByClassName("infobox infobox-top", "div");
		
		foreach($infoboxes as $infobox){
			
			$answer = [
				"title" => null,
				"description" => [],
				"url" => null,
				"thumb" => null,
				"table" => [],
				"sublink" => []
			];
			
			// load first part with title + short definition
			// NOTE(review): the delimiter is a bare newline here, yet parts
			// [0], [1] and [2] are read below -- confirm against the live
			// markup that this is the intended separator.
			$infobox_html = explode("\n", $infobox["innerHTML"]);
			
			$this->fuckhtml->load($infobox_html[0]);
			
			// title
			$answer["title"] =
				$this->fuckhtml
				->getTextContent(
					$this->fuckhtml->getElementsByTagName("h1")[0]
				);
			
			// short definition
			$definition = $this->fuckhtml->getElementsByTagName("p");
			
			if(count($definition) !== 0){
				
				$answer["description"][] = [
					"type" => "quote",
					"value" => $this->fuckhtml->getTextContent($definition[0])
				];
			}
			
			// get thumbnail, if it exists
			$this->fuckhtml->load($infobox_html[1]);
			
			$thumb =
				$this->fuckhtml
				->getElementsByClassName("float-right", "img");
			
			if(count($thumb) !== 0){
				
				preg_match(
					'/\/image\?img=([^&]+)/i',
					$thumb[0]["attributes"]["src"],
					$matches
				);
				
				if(count($matches) === 2){
					
					$answer["thumb"] =
						$this->fuckhtml->getTextContent($matches[1]);
				}
			}
			
			// get description
			$ps = $this->fuckhtml->getElementsByTagName("p");
			
			foreach($ps as $p){
				
				$this->fuckhtml->load($p);
				
				$strongs = $this->fuckhtml->getElementsByTagName("strong");
				
				// A paragraph is a table row only when it opens with a
				// <strong> label. The former pattern '/^\s*/i' matched every
				// paragraph, which made the branches below unreachable.
				if(
					count($strongs) !== 0 &&
					preg_match('/^\s*<strong[\s>]/i', $p["innerHTML"])
				){
					
					/*
						Parse table
					*/
					$strong = $strongs[0];
					
					// drop the label so only the value text remains
					$p["innerHTML"] =
						str_replace($strong["innerHTML"], "", $p["innerHTML"]);
					
					$strong =
						preg_replace(
							'/:$/',
							"",
							ucfirst(
								$this->fuckhtml->getTextContent($strong)
							)
						);
					
					$answer["table"][trim($strong)] =
						trim(
							$this->fuckhtml->getTextContent($p)
						);
					
					continue;
				}
				
				$as = $this->fuckhtml->getElementsByClassName("svg-icon");
				
				if(count($as) !== 0){
					
					/*
						Parse websites
					*/
					foreach($as as $a){
						
						// key is whatever follows the first class name
						$answer["sublink"][
							ucfirst(explode(" ", $a["attributes"]["class"], 2)[1])
						] =
							$this->fuckhtml
							->getTextContent($a["attributes"]["href"]);
					}
					
					continue;
				}
				
				/*
					Parse text content
				*/
				$tags = $this->fuckhtml->getElementsByTagName("*");
				
				$i = 0;
				foreach($tags as $tag){
					
					$c = count($answer["description"]);
					
					// remove tag from innerHTML
					$p["innerHTML"] =
						explode($tag["outerHTML"], $p["innerHTML"], 2);
					
					if(count($p["innerHTML"]) === 2){
						
						if(
							$i === 0 &&
							$c !== 0 &&
							$answer["description"][$c - 1]["type"] == "link"
						){
							
							$append = "\n\n";
						}else{
							
							$append = "";
						}
						
						if($p["innerHTML"][0] != ""){
							
							$answer["description"][] = [
								"type" => "text",
								"value" => $append . trim($p["innerHTML"][0])
							];
						}
						
						$p["innerHTML"] = $p["innerHTML"][1];
					}else{
						
						$p["innerHTML"] = $p["innerHTML"][0];
					}
					
					switch($tag["tagName"]){
						
						case "a":
							
							$value = $this->fuckhtml->getTextContent($tag);
							
							if(strtolower($value) == "wikipedia"){
								
								// attribution link is dropped; tidy the
								// preceding entry's trailing whitespace
								if($c !== 0){
									
									$answer["description"][$c - 1]["value"] =
										rtrim($answer["description"][$c - 1]["value"]);
								}
								break;
							}
							
							$answer["description"][] = [
								"type" => "link",
								"url" =>
									$this->fuckhtml
									->getTextContent($tag["attributes"]["href"]),
								"value" => $value
							];
							break;
					}
					
					$i++;
				}
			}
			
			// get URL
			$this->fuckhtml->load($infobox_html[2]);
			
			$answer["url"] =
				$this->fuckhtml
				->getTextContent(
					$this->fuckhtml
					->getElementsByTagName("a")[0]
					["attributes"]
					["href"]
				);
			
			// append answer
			$out["answer"][] = $answer;
		}
		
		/*
			Get news
		*/
		$this->fuckhtml->load($html);
		
		$news =
			$this->fuckhtml
			->getElementsByClassName("results news-results", "div");
		
		if(count($news) !== 0){
			
			$this->fuckhtml->load($news[0]);
			
			$lis = $this->fuckhtml->getElementsByTagName("li");
			
			foreach($lis as $li){
				
				$this->fuckhtml->load($li);
				
				$a = $this->fuckhtml->getElementsByClassName("ob", "a");
				
				if(count($a) === 0){
					
					continue;
				}
				
				$a = $a[0];
				
				// the date is the last " - " separated part of the first span
				$date =
					explode(
						" - ",
						$this->fuckhtml
						->getTextContent(
							$this->fuckhtml->getElementsByTagName("span")[0]
						)
					);
				
				$date = strtotime($date[count($date) - 1]);
				
				$out["news"][] = [
					"title" =>
						html_entity_decode(
							$this->fuckhtml->getTextContent($a)
						),
					"description" => null,
					"date" => $date,
					"thumb" => [
						"url" => null,
						"ratio" => null
					],
					"url" =>
						$this->fuckhtml
						->getTextContent($a["attributes"]["href"])
				];
			}
		}
		
		/*
			Get next page
		*/
		$this->fuckhtml->load($html);
		
		$pagination =
			$this->fuckhtml
			->getElementsByClassName("pagination");
		
		// count() returns an int, so compare with 0; the former
		// "!== false" check was always true
		if(count($pagination) !== 0){
			
			$this->fuckhtml->load($pagination[0]);
			$as = $this->fuckhtml->getElementsByTagName("a");
			
			foreach($as as $a){
				
				if($a["innerHTML"] == "Next"){
					
					$out["npt"] =
						$this->backend->store(
							$this->fuckhtml
							->getTextContent($a["attributes"]["href"]),
							"web",
							$proxy
						);
				}
			}
		}
		
		return $out;
	}
	
	/**
	 * Run a news search and scrape every <article> element of the response.
	 *
	 * @param array $get Request parameters; only $get["s"] (search term) is read
	 * @return array Result set with the keys status, npt and news
	 * @throws Exception When the search term is empty or the page cannot be fetched
	 */
	public function news($get){
		
		$search = $get["s"];
		if(strlen($search) === 0){
			
			throw new Exception("Search term is empty!");
		}
		
		$out = [
			"status" => "ok",
			"npt" => null,
			"news" => []
		];
		
		try{
			$html =
				$this->get(
					$this->backend->get_ip(),
					"https://www.mojeek.com/search",
					[
						"q" => $search,
						"fmt" => "news"
					]
				);
		}catch(Exception $error){
			
			throw new Exception("Failed to get HTML");
		}
		
		$this->fuckhtml->load($html);
		
		$articles = $this->fuckhtml->getElementsByTagName("article");
		
		foreach($articles as $article){
			
			$this->fuckhtml->load($article);
			
			$data = [
				"title" => null,
				"author" => null,
				"description" => null,
				"date" => null,
				"thumb" => [
					"url" => null,
					"ratio" => null
				],
				"url" => null
			];
			
			$a = $this->fuckhtml->getElementsByTagName("a")[0];
			
			$data["title"] =
				$this->fuckhtml
				->getTextContent($a["attributes"]["title"]);
			
			$data["url"] =
				$this->fuckhtml
				->getTextContent($a["attributes"]["href"]);
			
			// the <p> list is passed as the search collection below
			$p = $this->fuckhtml->getElementsByTagName("p");
			
			$data["description"] =
				$this->titledots(
					$this->fuckhtml
					->getTextContent(
						$this->fuckhtml
						->getElementsByClassName("s", $p)[0]
					)
				);
			
			if($data["description"] == ""){
				
				$data["description"] = null;
			}
			
			// get date from big node
			$date =
				$this->fuckhtml
				->getElementsByClassName("date", $p);
			
			if(count($date) !== 0){
				
				$data["date"] =
					strtotime(
						$this->fuckhtml->getTextContent($date[0])
					);
			}
			
			// grep date + author
			$s =
				$this->fuckhtml
				->getElementsByClassName("i", $p)[0];
			
			$this->fuckhtml->load($s);
			
			$a = $this->fuckhtml->getElementsByTagName("a");
			
			if(count($a) !== 0){
				
				// parse big node information
				$data["author"] =
					htmlspecialchars_decode(
						$this->fuckhtml->getTextContent($a[0]["innerHTML"])
					);
			}else{
				
				// parse smaller nodes
				$replace = $this->fuckhtml->getElementsByTagName("time")[0];
				
				$data["date"] =
					strtotime(
						$this->fuckhtml->getTextContent($replace)
					);
				
				// what remains once the <time> node is removed is the author
				$s["innerHTML"] =
					str_replace(
						$replace["outerHTML"],
						"",
						$s["innerHTML"]
					);
				
				$data["author"] =
					preg_replace(
						'/ • $/',
						"",
						$s["innerHTML"]
					);
			}
			
			$out["news"][] = $data;
		}
		
		return $out;
	}
	
	/**
	 * Strip dots and whitespace from both ends of a string.
	 *
	 * @param string $title Text to clean
	 * @return string Trimmed text
	 */
	private function titledots($title){
		
		return trim($title, ". \t\n\r\0\x0B");
	}
}