diff --git a/data/config.php b/data/config.php
index 2fd47aa..ef446c5 100644
--- a/data/config.php
+++ b/data/config.php
@@ -43,7 +43,7 @@ class config{
 	
 	// If this regex expression matches on the user agent, it blocks the request
 	// Not useful at all against a targetted attack
-	const HEADER_REGEX = '/bot|wget|curl|python-requests|scrapy|go-http-client|ruby|yahoo|spider|qwant/i';
+	const HEADER_REGEX = '/bot|wget|curl|python-requests|scrapy|go-http-client|ruby|yahoo|spider|qwant|meta/i';
 	
 	// Block clients who present any of the following headers in their request (SPECIFY IN !!lowercase!!)
 	// Eg: ["x-forwarded-for", "x-via", "forwarded-for", "via"];
@@ -118,7 +118,7 @@ class config{
 	
 	// Default user agent to use for scraper requests. Sometimes ignored to get specific webpages
 	// Changing this might break things.
-	const USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:141.0) Gecko/20100101 Firefox/141.0";
+	const USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:142.0) Gecko/20100101 Firefox/142.0";
 	
 	// Proxy pool assignments for each scraper
 	// false = Use server's raw IP
@@ -130,6 +130,8 @@ class config{
 	const PROXY_GOOGLE = false;
 	const PROXY_GOOGLE_API = false;
 	const PROXY_GOOGLE_CSE = false;
+	const PROXY_MULLVAD_GOOGLE = false;
+	const PROXY_MULLVAD_BRAVE = false;
 	const PROXY_STARTPAGE = false;
 	const PROXY_QWANT = false;
 	const PROXY_BAIDU = false;
@@ -143,6 +145,7 @@ class config{
 	const PROXY_WIBY = false;
 	const PROXY_CURLIE = false;
 	const PROXY_YT = false; // youtube
+	const PROXY_ARCHIVEORG = false;
 	const PROXY_SEPIASEARCH = false;
 	const PROXY_ODYSEE = false;
 	const PROXY_VIMEO = false;
diff --git a/scraper/mullvad.php b/scraper/mullvad.php
new file mode 100644
index 0000000..c143cd5
--- /dev/null
+++ b/scraper/mullvad.php
@@ -0,0 +1,359 @@
+engine = $engine;
+
+		include "lib/backend.php";
+		$this->backend = new backend("mullvad_{$this->engine}");
+	}
+
+	// Returns the filter definitions (country, language, time) for Mullvad Leta searches.
+	// $page is not used: the same filters are returned for every page type.
+	public function getfilters($page){
+		return [
+			"country" => [ // &country=
+				"display" => "Country",
+				"option" => [
+					"any" => "Any country",
+					"ar" => "Argentina",
+					"au" => "Australia",
+					"at" => "Austria",
+					"be" => "Belgium",
+					"br" => "Brazil",
+					"ca" => "Canada",
+					"cl" => "Chile",
+					"cn" => "China",
+					"dk" => "Denmark",
+					"fi" => "Finland",
+					"fr" => "France",
+					"de" => "Germany",
+					"hk" => "Hong Kong",
+					"in" => "India",
+					"id" => "Indonesia",
+					"it" => "Italy",
+					"jp" => "Japan",
+					"kr" => "Korea, Republic",
+					"my" => "Malaysia",
+					"mx" => "Mexico",
+					"nl" => "Netherlands",
+					"nz" => "New Zealand",
+					"no" => "Norway",
+					"ph" => "Philippines",
+					"pl" => "Poland",
+					"pt" => "Portugal",
+					"ru" => "Russian Federation",
+					"sa" => "Saudi Arabia",
+					"za" => "South Africa",
+					"es" => "Spain",
+					"se" => "Sweden",
+					"ch" => "Switzerland",
+					"tw" => "Taiwan",
+					"tr" => "Turkey",
+					"uk" => "United Kingdom",
+					"us" => "United States"
+				]
+			],
+			"language" => [ // &language=
+				"display" => "Language",
+				"option" => [
+					"any" => "Any language",
+					"ar" => "Arabic",
+					"bg" => "Bulgarian",
+					"ca" => "Catalan",
+					"zh-hans" => "Chinese (Simplified)",
+					"zh-hant" => "Chinese (Traditional)",
+					"hr" => "Croatian",
+					"cs" => "Czech",
+					"da" => "Danish",
+					"nl" => "Dutch",
+					"en" => "English",
+					"et" => "Estonian",
+					"fi" => "Finnish",
+					"fr" => "French",
+					"de" => "German",
+					"he" => "Hebrew",
+					"hu" => "Hungarian",
+					"is" => "Icelandic",
+					"it" => "Italian",
+					"jp" => "Japanese",
+					"ko" => "Korean",
+					"lv" => "Latvian",
+					"lt" => "Lithuanian",
+					"nb" => "Norwegian",
+					"pl" => "Polish",
+					"pt" => "Portuguese",
+					"ro" => "Romanian",
+					"ru" => "Russian",
+					"sr" => "Serbian",
+					"sk" => "Slovak",
+					"sl" => "Slovenian",
+					"es" => "Spanish",
+					"sv" => "Swedish",
+					"tr" => "Turkish"
+				]
+			],
+			"time" => [ // &lastUpdated=
+				"display" => "Time posted",
+				"option" => [
+					"any" => "Any time",
+					"d" => "Past day",
+					"w" => "Past week",
+					"m" => "Past month",
+					"y" => "Past year"
+				]
+			]
+		];
+	}
+
+	// Sends a GET request to $url, with $get appended as a query string when not empty,
+	// using the proxy the backend assigns for $proxy, and returns the response body.
+	// Throws Exception carrying the curl error message when the transfer fails.
+	private function get($proxy, $url, $get = []){
+
+		$curlproc = curl_init();
+
+		if($get !== []){
+			$get = http_build_query($get);
+			$url .= "?" . $get;
+		}
+
+		curl_setopt($curlproc, CURLOPT_URL, $url);
+
+		// http2 bypass
+		curl_setopt($curlproc, CURLOPT_HTTP_VERSION, CURL_HTTP_VERSION_2_0);
+
+		curl_setopt($curlproc, CURLOPT_ENCODING, ""); // default encoding
+		curl_setopt($curlproc, CURLOPT_HTTPHEADER,
+			["User-Agent: " . config::USER_AGENT,
+			"Accept: */*",
+			"Accept-Language: en-US,en;q=0.5",
+			"Accept-Encoding: gzip, deflate, br, zstd",
+			"Referer: https://leta.mullvad.net/search",
+			"DNT: 1",
+			"Sec-GPC: 1",
+			"Connection: keep-alive",
+			"Cookie: engine=brave", // NOTE(review): sent for every engine, including "google" - confirm this is intended
+			"Sec-Fetch-Dest: empty",
+			"Sec-Fetch-Mode: cors",
+			"Sec-Fetch-Site: same-origin",
+			"Priority: u=0",
+			"TE: trailers"]
+		);
+
+		curl_setopt($curlproc, CURLOPT_RETURNTRANSFER, true);
+		curl_setopt($curlproc, CURLOPT_SSL_VERIFYHOST, 2);
+		curl_setopt($curlproc, CURLOPT_SSL_VERIFYPEER, true);
+		curl_setopt($curlproc, CURLOPT_CONNECTTIMEOUT, 30);
+		curl_setopt($curlproc, CURLOPT_TIMEOUT, 30);
+
+		$this->backend->assign_proxy($curlproc, $proxy);
+
+		$data = curl_exec($curlproc);
+
+		if(curl_errno($curlproc)){
+
+			throw new Exception(curl_error($curlproc));
+		}
+
+		curl_close($curlproc);
+		return $data;
+	}
+
+	// Fetches one page of web results from Mullvad Leta and maps it to the result array.
+	// A truthy $get["npt"] resumes the request parameters stored under that token;
+	// otherwise $get["s"] and the country/language/time filters start a new search.
+	// Throws Exception on an empty search term, fetch failure or unexpected payload.
+	public function web($get){
+
+		if($get["npt"]){
+
+			[$params, $proxy] = $this->backend->get($get["npt"], "web");
+			$params = json_decode($params, true);
+
+		}else{
+
+			if(strlen($get["s"]) === 0){
+
+				throw new Exception("Search term is empty!");
+			}
+
+			// generate filters
+			$params = [
+				"q" => $get["s"],
+				"engine" => $this->engine,
+				"page" => 1
+			];
+
+			if($get["country"] != "any"){
+
+				$params["country"] = $get["country"];
+			}
+
+			if($get["language"] != "any"){
+
+				$params["language"] = $get["language"];
+			}
+
+			if($get["time"] != "any"){
+
+				$params["lastUpdated"] = $get["time"];
+			}
+
+			$proxy = $this->backend->get_ip();
+		}
+
+		try{
+			$json = $this->get(
+				$proxy,
+				"https://leta.mullvad.net/search/__data.json",
+				$params
+			);
+		}catch(Exception $error){
+
+			throw new Exception("Failed to fetch search page");
+		}
+
+		$json = json_decode($json, true);
+
+		if($json === null){
+
+			throw new Exception("Failed to decode JSON");
+		}
+
+		if(!isset($json["nodes"])){
+
+			throw new Exception("Mullvad did not return a nodes object");
+		}
+
+		$out = [
+			"status" => "ok",
+			"spelling" => [
+				"type" => "no_correction",
+				"using" => null,
+				"correction" => null
+			],
+			"npt" => null, // replaced below once the payload points at a next page
+			"answer" => [],
+			"web" => [],
+			"image" => [],
+			"video" => [],
+			"news" => [],
+			"related" => []
+		];
+
+		// parse json payload
+		foreach($json["nodes"] as $node){
+
+			if(!isset($node["data"][0]["q"])){
+
+				// not iterating through the query object
+				continue;
+			}
+
+			// node 0 contains pointers to what we need to iterate through
+			$node0 = $node["data"][0];
+
+			if(!isset($node0["success"], $node["data"][$node0["success"]])){
+
+				throw new Exception("Mullvad did not return a success object");
+			}
+
+			$success = $node["data"][$node0["success"]];
+
+			if($success === false){
+
+				throw new Exception("Mullvad flagged the response as unsuccessful");
+			}
+
+			if(!isset($node0["items"], $node["data"][$node0["items"]])){
+
+				throw new Exception("Mullvad did not return an items object");
+			}
+
+			$search_pointers = $node["data"][$node0["items"]];
+
+			//
+			// Iterate over results
+			//
+			foreach($search_pointers as $pointer){
+
+				$item = $node["data"][$pointer] ?? []; // by value: a reference here would be overwritten by the next iteration
+
+				$link = $node["data"][$item["link"]] ?? null;
+				$title = $node["data"][$item["title"]] ?? null;
+				$description = $node["data"][$item["snippet"]] ?? null;
+
+				$date = null;
+				if($this->engine == "google"){
+
+					// attempt to extract date
+					// Jan 12, 2017
+					$date_parts = explode(" ... ", (string)$description, 2);
+
+					if(
+						count($date_parts) === 2 &&
+						strlen($date_parts[0]) < 15
+					){
+
+						$date = strtotime(trim($date_parts[0]));
+
+						if($date === false){
+
+							$date = null;
+						}else{
+
+							$description = trim($date_parts[1]);
+						}
+					}
+				}
+
+				$out["web"][] = [
+					"title" => $this->titledots($title),
+					"description" => $this->titledots($description),
+					"url" => $link,
+					"date" => $date,
+					"type" => "web",
+					"thumb" => [
+						"url" => null,
+						"ratio" => null
+					],
+					"sublink" => [],
+					"table" => []
+				];
+			}
+
+			//
+			// Get nextpage
+			//
+			if(isset($node0["next"], $node["data"][$node0["next"]])){
+
+				$params["page"] = (int)$node["data"][$node0["next"]];
+
+				$out["npt"] =
+					$this->backend->store(
+						json_encode($params),
+						"web",
+						$proxy
+					);
+			}
+		}
+
+		return $out;
+	}
+
+	// Strips spaces, dots, tabs, newlines, NUL, vertical tabs and "…" from both ends of $title.
+	// A null $title is treated as an empty string.
+	private function titledots($title){
+
+		$title = (string)$title;
+
+		// trim() is byte-wise: listing the 3-byte "…" there can cut other UTF-8 characters in half
+		$trimmed = preg_replace('/^[ .\t\n\r\0\x0B…]+|[ .\t\n\r\0\x0B…]+$/u', "", $title);
+
+		// null means the input was not valid UTF-8: fall back to an ASCII-only trim
+		return $trimmed ?? trim($title, " .\t\n\r\0\x0B");
+	}
+}
diff --git a/scraper/mullvad_brave.php b/scraper/mullvad_brave.php
new file mode 100644
index 0000000..958733b
--- /dev/null
+++ b/scraper/mullvad_brave.php
@@ -0,0 +1,20 @@
+mullvad = new mullvad("brave");
+	}
+
+	public function getfilters($page){
+
+		return $this->mullvad->getfilters($page);
+	}
+
+	public function web($get){
+
+		return $this->mullvad->web($get);
+	}
+}
diff --git a/scraper/mullvad_google.php b/scraper/mullvad_google.php
new file mode 100644
index 0000000..9804942
--- /dev/null
+++ b/scraper/mullvad_google.php
@@ -0,0 +1,20 @@
+mullvad = new mullvad("google");
+	}
+
+	public function getfilters($page){
+
+		return $this->mullvad->getfilters($page);
+	}
+
+	public function web($get){
+
+		return $this->mullvad->web($get);
+	}
+}
diff --git a/settings.php b/settings.php
index 75bc373..d680d52 100644
--- a/settings.php
+++ b/settings.php
@@ -125,6 +125,10 @@ $settings = [
 						"value" => "brave",
 						"text" => "Brave"
 					],
+					[
+						"value" => "mullvad_brave",
+						"text" => "Mullvad (Brave)"
+					],
 					[
 						"value" => "yandex",
 						"text" => "Yandex"
@@ -137,6 +141,10 @@ $settings = [
 						"value" => "google_cse",
 						"text" => "Google CSE"
 					],
+					[
+						"value" => "mullvad_google",
+						"text" => "Mullvad (Google)"
+					],
 					[
 						"value" => "startpage",
 						"text" => "Startpage"
@@ -177,6 +185,10 @@ $settings = [
 						"value" => "coccoc",
 						"text" => "Cốc Cốc"
 					],
+					[
+						"value" => "solofield",
+						"text" => "Solofield"
+					],
 					[
 						"value" => "marginalia",
 						"text" => "Marginalia"
@@ -231,6 +243,10 @@ $settings = [
 						"value" => "baidu",
 						"text" => "Baidu"
 					],
+					[
+						"value" => "solofield",
+						"text" => "Solofield"
+					],
 					[
 						"value" => "pinterest",
 						"text" => "Pinterest"
@@ -308,6 +324,10 @@ $settings = [
 					[
 						"value" => "coccoc",
 						"text" => "Cốc Cốc"
+					],
+					[
+						"value" => "solofield",
+						"text" => "Solofield"
 					]
 				]
 			],