diff --git a/lib/frontend.php b/lib/frontend.php index 13bbfab..8f2f20b 100644 --- a/lib/frontend.php +++ b/lib/frontend.php @@ -646,16 +646,13 @@ class frontend{ "qwant" => "Qwant", "ghostery" => "Ghostery", "yep" => "Yep", - "greppr" => "Greppr", - "crowdview" => "Crowdview", "mwmbl" => "Mwmbl", "mojeek" => "Mojeek", "baidu" => "Baidu", "coccoc" => "Cốc Cốc", "solofield" => "Solofield", "marginalia" => "Marginalia", - "wiby" => "wiby", - "curlie" => "Curlie" + "wiby" => "wiby" ] ]; break; diff --git a/scraper/crowdview.php b/scraper/crowdview.php deleted file mode 100644 index 8fb267b..0000000 --- a/scraper/crowdview.php +++ /dev/null @@ -1,145 +0,0 @@ -backend = new backend("crowdview"); - - include "lib/fuckhtml.php"; - $this->fuckhtml = new fuckhtml(); - } - - public function getfilters($page){ - - return []; - } - - private function get($proxy, $url, $get = []){ - - $curlproc = curl_init(); - - if($get !== []){ - $get = http_build_query($get); - $url .= "?" . $get; - } - - curl_setopt($curlproc, CURLOPT_URL, $url); - - curl_setopt($curlproc, CURLOPT_ENCODING, ""); // default encoding - curl_setopt($curlproc, CURLOPT_HTTPHEADER, - ["User-Agent: " . config::USER_AGENT, - "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8", - "Accept-Language: en-US,en;q=0.5", - "Accept-Encoding: gzip", - "DNT: 1", - "Connection: keep-alive", - "Upgrade-Insecure-Requests: 1", - "Sec-Fetch-Dest: document", - "Sec-Fetch-Mode: navigate", - "Sec-Fetch-Site: none", - "Sec-Fetch-User: ?1"] - ); - - curl_setopt($curlproc, CURLOPT_RETURNTRANSFER, true); - curl_setopt($curlproc, CURLOPT_SSL_VERIFYHOST, 2); - curl_setopt($curlproc, CURLOPT_SSL_VERIFYPEER, true); - curl_setopt($curlproc, CURLOPT_CONNECTTIMEOUT, 30); - curl_setopt($curlproc, CURLOPT_TIMEOUT, 30); - - $this->backend->assign_proxy($curlproc, $proxy); - - $data = curl_exec($curlproc); - - if(curl_errno($curlproc)){ - - throw new Exception(curl_error($curlproc)); - } - - curl_close($curlproc); - return $data; - } - - public function web($get){ - - $search = $get["s"]; - if(strlen($search) === 0){ - - throw new Exception("Search term is empty!"); - } - - $proxy = $this->backend->get_ip(); - - try{ - $json = $this->get( - $proxy, - "https://crowdview-next-js.onrender.com/api/search-v3", - [ - "query" => $search - ] - ); - }catch(Exception $error){ - - throw new Exception("Failed to fetch JSON"); - } - - $out = [ - "status" => "ok", - "spelling" => [ - "type" => "no_correction", - "using" => null, - "correction" => null - ], - "npt" => null, - "answer" => [], - "web" => [], - "image" => [], - "video" => [], - "news" => [], - "related" => [] - ]; - - $json = json_decode($json, true); - - if($json === NULL){ - - throw new Exception("Failed to decode JSON"); - } - - foreach($json["results"] as $item){ - - $description = explode("", $item["snippet"], 2); - - $out["web"][] = [ - "title" => $this->sanitize($item["title"]), - "description" => $this->sanitize($description[1]), - "url" => $item["link"], - "date" => strtotime($description[0]), - "type" => "web", - "thumb" => [ - "url" => null, - "ratio" => null - ], - "sublink" => [], - "table" => [] - ]; - } - - return $out; - } - - private function sanitize($html){ - - return - trim( - $this->fuckhtml - ->getTextContent( - html_entity_decode( - $html - ) - ), - ". " - ); - } -} diff --git a/scraper/curlie.php b/scraper/curlie.php deleted file mode 100644 index 61a8eb2..0000000 --- a/scraper/curlie.php +++ /dev/null @@ -1,309 +0,0 @@ -backend = new backend("curlie"); - - include "lib/fuckhtml.php"; - $this->fuckhtml = new fuckhtml(); - } - - public function getfilters($page){ - - if($page != "web"){ - - return []; - } - - return [ - "lang" => [ - "display" => "Language", - "option" => [ - "any" => "Any language", - "en" => "English", - "de" => "German", - "fr" => "French", - "ja" => "Japanese", - "it" => "Italian", - "es" => "Spanish", - "ru" => "Russian", - "nl" => "Dutch", - "pl" => "Polish", - "tr" => "Turkish", - "da" => "Danish", - "sv" => "Swedish", - "no" => "Norwegian", - "is" => "Icelandic", - "fo" => "Faroese", - "fi" => "Finnish", - "et" => "Estonian", - "lt" => "Lithuanian", - "lv" => "Latvian", - "cy" => "Welsh", - "ga" => "Irish", - "gd" => "Scottish Gaelic", - "br" => "Breton", - "fy" => "Frisian", - "frr" => "North Frisian", - "gem" => "Saterland Frisian", - "lb" => "Luxembourgish", - "rm" => "Romansh", - "pt" => "Portuguese", - "ca" => "Catalan", - "gl" => "Galician", - "eu" => "Basque", - "ast" => "Asturian", - "an" => "Aragonese", - "fur" => "Friulan", - "sc" => "Sardinian", - "scn" => "Sicilian", - "oc" => "Occitan", - "be" => "Belarusian", - "cs" => "Czech", - "hu" => "Hungarian", - "sk" => "Slovak", - "uk" => "Ukrainian", - "csb" => "Kashubian", - "tt" => "Tatar", - "ba" => "Bashkir", - "os" => "Ossetian", - "sl" => "Slovene", - "sr" => "Serbian", - "hr" => "Croatian", - "bs" => "Bosnian", - "bg" => "Bulgarian", - "sq" => "Albanian", - "ro" => "Romanian", - "mk" => "Macedonian", - "el" => "Greek", - "iw" => "Hebrew", - "fa" => "Persian", - "ar" => "Arabic", - "ku" => "Kurdish", - "az" => "Azerbaijani", - "hy" => "Armenian", - "af" => "Afrikaans", - "sw" => "Kiswahili", - "uz" => "Uzbek", - "kk" => "Kazakh", - "ky" => "Kyrgyz", - "tg" => "Tajik", - "tk" => "Turkmen", - "ug" => "Uyghurche", - "hi" => "Hindi", - "si" => "Sinhalese", - "gu" => "Gujarati", - "ur" => "Urdu", - "mr" => "Marathi", - "pa" => "Punjabi", - "bn" => "Bengali", - "ta" => "Tamil", - "te" => "Telugu", - "kn" => "Kannada", - "zh_CN" => "Chinese Simplified", - "zh_TW" => "Chinese Traditional", - "ko" => "Korean", - "cfr" => "Taiwanese", - "th" => "Thai", - "vi" => "Vietnamese", - "in" => "Indonesian", - "ms" => "Malay", - "tl" => "Tagalog", - "eo" => "Esperanto", - "ia" => "Interlingua", - "la" => "Latin" - ] - ] - ]; - } - - private function get($proxy, $url, $get = []){ - - $curlproc = curl_init(); - - if($get !== []){ - $get = http_build_query($get); - $url .= "?" . $get; - } - - curl_setopt($curlproc, CURLOPT_URL, $url); - - curl_setopt($curlproc, CURLOPT_ENCODING, ""); // default encoding - curl_setopt($curlproc, CURLOPT_HTTPHEADER, - ["User-Agent: " . config::USER_AGENT, - "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8", - "Accept-Language: en-US,en;q=0.5", - "Accept-Encoding: gzip", - "DNT: 1", - "Connection: keep-alive", - "Upgrade-Insecure-Requests: 1", - "Sec-Fetch-Dest: document", - "Sec-Fetch-Mode: navigate", - "Sec-Fetch-Site: none", - "Sec-Fetch-User: ?1"] - ); - - curl_setopt($curlproc, CURLOPT_RETURNTRANSFER, true); - curl_setopt($curlproc, CURLOPT_SSL_VERIFYHOST, 2); - curl_setopt($curlproc, CURLOPT_SSL_VERIFYPEER, true); - curl_setopt($curlproc, CURLOPT_CONNECTTIMEOUT, 30); - curl_setopt($curlproc, CURLOPT_TIMEOUT, 30); - - $this->backend->assign_proxy($curlproc, $proxy); - - $data = curl_exec($curlproc); - - if(curl_errno($curlproc)){ - - throw new Exception(curl_error($curlproc)); - } - - curl_close($curlproc); - return $data; - } - - public function web($get){ - - if($get["npt"]){ - - [$query, $proxy] = $this->backend->get($get["npt"], "web"); - - try{ - $html = $this->get( - $proxy, - "https://curlie.org/" . $query, - [] - ); - }catch(Exception $error){ - - throw new Exception("Failed to fetch search page"); - } - - }else{ - $proxy = $this->backend->get_ip(); - - $query = [ - "q" => $get["s"], - "start" => 0, - "stime" => 92452189 // ? - ]; - - if($get["lang"] !== "any"){ - - $query["lang"] = $get["lang"]; - } - - try{ - $html = $this->get( - $proxy, - "https://curlie.org/search", - $query - ); - }catch(Exception $error){ - - throw new Exception("Failed to fetch search page"); - } - } - - $this->fuckhtml->load($html); - - $nextpage = - $this->fuckhtml - ->getElementsByClassName( - "next-page", - "a" - ); - - if(count($nextpage) !== 0){ - - $nextpage = - $this->backend->store( - $nextpage[0]["attributes"]["href"], - "web", - $proxy - ); - }else{ - - $nextpage = null; - } - - $out = [ - "status" => "ok", - "spelling" => [ - "type" => "no_correction", - "using" => null, - "correction" => null - ], - "npt" => $nextpage, - "answer" => [], - "web" => [], - "image" => [], - "video" => [], - "news" => [], - "related" => [] - ]; - - $items = - $this->fuckhtml - ->getElementsByClassName( - "site-item", - "div" - ); - - foreach($items as $item){ - - $this->fuckhtml->load($item); - - $a = - $this->fuckhtml - ->getElementsByAttributeValue( - "target", - "_blank", - "a" - )[0]; - - $description = - $this->fuckhtml - ->getElementsByClassName("site-descr"); - - if(count($description) !== 0){ - - $description = - $this->fuckhtml - ->getTextContent( - $description[0] - ); - }else{ - - $description = null; - } - - $out["web"][] = [ - "title" => - $this->fuckhtml - ->getTextContent( - $a - ), - "description" => $description, - "url" => - $this->fuckhtml - ->getTextContent( - $a["attributes"]["href"] - ), - "date" => null, - "type" => "web", - "thumb" => [ - "url" => null, - "ratio" => null - ], - "sublink" => [], - "table" => [] - ]; - } - - return $out; - } -} diff --git a/scraper/greppr.php b/scraper/greppr.php deleted file mode 100644 index 3d8b517..0000000 --- a/scraper/greppr.php +++ /dev/null @@ -1,452 +0,0 @@ -backend = new backend("greppr"); - - include "lib/fuckhtml.php"; - $this->fuckhtml = new fuckhtml(); - } - - public function getfilters($page){ - - return []; - } - - private function get($proxy, $url, $get = [], $cookies = [], $post = false){ - - $curlproc = curl_init(); - - curl_setopt($curlproc, CURLOPT_URL, $url); - - curl_setopt($curlproc, CURLOPT_ENCODING, ""); // default encoding - - $cookie = []; - foreach($cookies as $k => $v){ - - $cookie[] = "{$k}={$v}"; - } - - $cookie = implode("; ", $cookie); - - if($post === false){ - - if($get !== []){ - $get = http_build_query($get); - $url .= "?" . $get; - } - - if($cookie == ""){ - - curl_setopt($curlproc, CURLOPT_HTTPHEADER, - ["User-Agent: " . config::USER_AGENT, - "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8", - "Accept-Language: en-US,en;q=0.5", - "Accept-Encoding: gzip", - "DNT: 1", - "Connection: keep-alive", - "Upgrade-Insecure-Requests: 1", - "Sec-Fetch-Dest: document", - "Sec-Fetch-Mode: navigate", - "Sec-Fetch-Site: none", - "Sec-Fetch-User: ?1"] - ); - }else{ - - curl_setopt($curlproc, CURLOPT_HTTPHEADER, - ["User-Agent: " . config::USER_AGENT, - "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", - "Accept-Language: en-US,en;q=0.5", - "Accept-Encoding: gzip, deflate, br, zstd", - "DNT: 1", - "Sec-GPC: 1", - "Connection: keep-alive", - "Referer: https://greppr.org/search", - "Cookie: {$cookie}", - "Upgrade-Insecure-Requests: 1", - "Sec-Fetch-Dest: document", - "Sec-Fetch-Mode: navigate", - "Sec-Fetch-Site: same-origin", - "Sec-Fetch-User: ?1", - "Priority: u=0, i"] - ); - } - }else{ - - $get = http_build_query($get); - - curl_setopt($curlproc, CURLOPT_POST, true); - curl_setopt($curlproc, CURLOPT_POSTFIELDS, $get); - - curl_setopt($curlproc, CURLOPT_HTTPHEADER, - ["User-Agent: " . config::USER_AGENT, - "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", - "Accept-Language: en-US,en;q=0.5", - "Accept-Encoding: gzip, deflate, br, zstd", - "Content-Type: application/x-www-form-urlencoded", - "Content-Length: " . strlen($get), - "Origin: https://greppr.org", - "DNT: 1", - "Sec-GPC: 1", - "Connection: keep-alive", - "Referer: https://greppr.org/", - "Cookie: {$cookie}", - "Upgrade-Insecure-Requests: 1", - "Sec-Fetch-Dest: document", - "Sec-Fetch-Mode: navigate", - "Sec-Fetch-Site: same-origin", - "Sec-Fetch-User: ?1", - "Priority: u=0, i"] - ); - } - - curl_setopt($curlproc, CURLOPT_RETURNTRANSFER, true); - curl_setopt($curlproc, CURLOPT_SSL_VERIFYHOST, 2); - curl_setopt($curlproc, CURLOPT_SSL_VERIFYPEER, true); - curl_setopt($curlproc, CURLOPT_CONNECTTIMEOUT, 30); - curl_setopt($curlproc, CURLOPT_TIMEOUT, 30); - - $this->backend->assign_proxy($curlproc, $proxy); - - $headers = []; - - curl_setopt( - $curlproc, - CURLOPT_HEADERFUNCTION, - function($curlproc, $header) use (&$headers){ - - $len = strlen($header); - $header = explode(':', $header, 2); - - if(count($header) < 2){ - - // ignore invalid headers - return $len; - } - - $headers[strtolower(trim($header[0]))][] = trim($header[1]); - - return $len; - } - ); - - $data = curl_exec($curlproc); - - if(curl_errno($curlproc)){ - - throw new Exception(curl_error($curlproc)); - } - - curl_close($curlproc); - - return [ - "headers" => $headers, - "data" => $data - ]; - } - - public function web($get, $first_attempt = true){ - - if($get["npt"]){ - - [$q, $proxy] = $this->backend->get($get["npt"], "web"); - - $tokens = json_decode($q, true); - - // - // Get paginated page - // - try{ - - $html = $this->get( - $proxy, - "https://greppr.org" . $tokens["get"], - [], - $tokens["cookies"], - false - ); - }catch(Exception $error){ - - throw new Exception("Failed to fetch search page"); - } - - }else{ - - $search = $get["s"]; - if(strlen($search) === 0){ - - throw new Exception("Search term is empty!"); - } - - $proxy = $this->backend->get_ip(); - - // - // get token - // - try{ - - $html = - $this->get( - $proxy, - "https://greppr.org", - [], - [], - false - ); - }catch(Exception $error){ - - throw new Exception("Failed to fetch homepage"); - } - - // - // Parse token - // - $this->fuckhtml->load($html["data"]); - - $tokens = [ - "req" => null, - "data" => null, - "cookies" => null - ]; - - $inputs = - $this->fuckhtml - ->getElementsByTagName( - "input" - ); - - foreach($inputs as $input){ - - if(!isset($input["attributes"]["name"])){ - - continue; - } - - if( - isset($input["attributes"]["value"]) && - !empty($input["attributes"]["value"]) - ){ - - $tokens - ["data"] - [$this->fuckhtml - ->getTextContent( - $input["attributes"]["name"] - )] = - $this->fuckhtml - ->getTextContent( - $input["attributes"]["value"] - ); - }else{ - - $tokens["req"] = - $this->fuckhtml - ->getTextContent( - $input["attributes"]["name"] - ); - } - } - - if($tokens["req"] === null){ - - throw new Exception("Failed to get request ID"); - } - - if(isset($html["headers"]["set-cookie"])){ - - foreach($html["headers"]["set-cookie"] as $cookie){ - - if( - preg_match( - '/([^=]+)=([^;]+)/', - $cookie, - $matches - ) - ){ - - $tokens["cookies"][$matches[1]] = $matches[2]; - } - } - } - - // - // Get initial search page - // - $tokens_req = $tokens["data"]; - $tokens_req[$tokens["req"]] = $search; - - try{ - - $html = $this->get( - $proxy, - "https://greppr.org/search", - $tokens_req, - $tokens["cookies"], - true - ); - }catch(Exception $error){ - - throw new Exception("Failed to fetch search page"); - } - } - - //$html = file_get_contents("scraper/greppr.html"); - //$this->fuckhtml->load($html); - $this->fuckhtml->load($html["data"]); - - $out = [ - "status" => "ok", - "spelling" => [ - "type" => "no_correction", - "using" => null, - "correction" => null - ], - "npt" => null, - "answer" => [], - "web" => [], - "image" => [], - "video" => [], - "news" => [], - "related" => [] - ]; - - // get results for later - $results = - $this->fuckhtml - ->getElementsByClassName( - "result", - "div" - ); - - // check for next page - $next_elem = - $this->fuckhtml - ->getElementsByClassName( - "pagination", - "ul" - ); - - if(count($next_elem) !== 0){ - - $this->fuckhtml->load($next_elem[0]); - - $as = - $this->fuckhtml - ->getElementsByClassName( - "page-link", - "a" - ); - - $break = false; - foreach($as as $a){ - - if($break === true){ - - $out["npt"] = - $this->backend->store( - json_encode([ - "get" => - $this->fuckhtml - ->getTextContent( - $a["attributes"]["href"] - ), - "cookies" => $tokens["cookies"] - ]), - "web", - $proxy - ); - break; - } - - if($a["attributes"]["href"] == "#"){ - - $break = true; - } - } - } - - // scrape results - foreach($results as $result){ - - $this->fuckhtml->load($result); - - $a = - $this->fuckhtml - ->getElementsByTagName( - "a" - )[0]; - - $description = - $this->fuckhtml - ->getElementsByClassName( - "highlightedDesc", - "p" - ); - - if(count($description) === 0){ - - $description = null; - }else{ - - $description = - $this->limitstrlen( - $this->fuckhtml - ->getTextContent( - $description[0] - ) - ); - } - - $date = - $this->fuckhtml - ->getElementsByTagName( - "p" - ); - - $date = - strtotime( - explode( - ":", - $this->fuckhtml - ->getTextContent( - $date[count($date) - 1]["innerHTML"] - ) - )[1] - ); - - $out["web"][] = [ - "title" => - $this->fuckhtml - ->getTextContent( - $a["innerHTML"] - ), - "description" => $description, - "url" => - $this->fuckhtml - ->getTextContent( - $a["attributes"]["href"] - ), - "date" => $date, - "type" => "web", - "thumb" => [ - "url" => null, - "ratio" => null - ], - "sublink" => [], - "table" => [] - ]; - } - - return $out; - } - - private function limitstrlen($text){ - - return explode("\n", wordwrap($text, 300, "\n"))[0]; - } -}