diff --git a/lib/anubis.php b/lib/anubis.php new file mode 100644 index 0000000..ab075ff --- /dev/null +++ b/lib/anubis.php @@ -0,0 +1,100 @@ +fuckhtml = new fuckhtml(); + } + + public function scrape($html){ + + $this->fuckhtml->load($html); + + $script = + $this->fuckhtml + ->getElementById( + "anubis_challenge", + "script" + ); + + if(count($script) === 0){ + + throw new Exception("Failed to scrape anubis challenge data"); + } + + $script = + json_decode( + $this->fuckhtml + ->getTextContent( + $script + ), + true + ); + + if($script === null){ + + throw new Exception("Failed to decode anubis challenge data"); + } + + if( + !isset($script["challenge"]) || + !isset($script["rules"]["difficulty"]) || + !is_int($script["rules"]["difficulty"]) || + !is_string($script["challenge"]) + ){ + + throw new Exception("Found invalid challenge data"); + } + + return $this->rape($script["challenge"], $script["rules"]["difficulty"]); + } + + private function is_valid_hash($hash, $difficulty){ + + for ($i=0; $i<$difficulty; $i++) { + + $index = (int)floor($i / 2); + $nibble = $i % 2; + + $byte = ord($hash[$index]); + $nibble = ($byte >> ($nibble === 0 ? 4 : 0)) & 0x0f; + + if($nibble !== 0){ + return false; + } + } + + return true; + } + + public function rape($data, $difficulty = 5){ + + $nonce = 0; + + while(true){ + + $hash_binary = hash("sha256", $data . $nonce, true); + + if($this->is_valid_hash($hash_binary, $difficulty)){ + + $hash_hex = bin2hex($hash_binary); + + return [ + "response" => $hash_hex, + //"data" => $data, + //"difficulty" => $difficulty, + "nonce" => $nonce + ]; + } + + $nonce++; + } + } +} diff --git a/scraper/marginalia.php b/scraper/marginalia.php index b9d555a..e62a485 100644 --- a/scraper/marginalia.php +++ b/scraper/marginalia.php @@ -3,7 +3,10 @@ class marginalia{ public function __construct(){ - include "lib/fuckhtml.php"; + include "lib/anubis.php"; + $this->anubis = new anubis(); + + include_once "lib/fuckhtml.php"; $this->fuckhtml = new fuckhtml(); include "lib/backend.php"; @@ -102,7 +105,40 @@ class marginalia{ ); } - private function get($proxy, $url, $get = []){ + private function get($proxy, $url, $get = [], $get_cookies = 1){ + + $curlproc = curl_init(); + + switch($get_cookies){ + + case 0: + $cookies = ""; + $cookies_tmp = []; + curl_setopt($curlproc, CURLOPT_HEADERFUNCTION, function($curlproc, $header) use (&$cookies_tmp){ + + $length = strlen($header); + + $header = explode(":", $header, 2); + + if(trim(strtolower($header[0])) == "set-cookie"){ + + $cookie_tmp = explode("=", trim($header[1]), 2); + + $cookies_tmp[trim($cookie_tmp[0])] = + explode(";", $cookie_tmp[1], 2)[0]; + } + + return $length; + }); + break; + + case 1: + $cookies = ""; + break; + + default: + $cookies = "Cookie: " . $get_cookies; + } $headers = [ "User-Agent: " . config::USER_AGENT, @@ -110,6 +146,7 @@ class marginalia{ "Accept-Language: en-US,en;q=0.5", "Accept-Encoding: gzip", "DNT: 1", + $cookies, "Connection: keep-alive", "Upgrade-Insecure-Requests: 1", "Sec-Fetch-Dest: document", @@ -118,8 +155,6 @@ class marginalia{ "Sec-Fetch-User: ?1" ]; - $curlproc = curl_init(); - if($get !== []){ $get = http_build_query($get); $url .= "?" . $get; @@ -145,7 +180,19 @@ class marginalia{ throw new Exception(curl_error($curlproc)); } - curl_close($curlproc); + if($get_cookies === 0){ + + $cookie = []; + + foreach($cookies_tmp as $key => $value){ + + $cookie[] = $key . "=" . $value; + } + + curl_close($curlproc); + return implode(";", $cookie); + } + return $data; } @@ -267,6 +314,55 @@ class marginalia{ // HTML parser $proxy = $this->backend->get_ip(); + // + // Bypass anubis check + // + if(($anubis_key = apcu_fetch("marginalia_cookie")) === false){ + + try{ + $html = + $this->get( + $proxy, + "https://old-search.marginalia.nu/" + ); + }catch(Exception $error){ + + throw new Exception("Failed to get anubis challenge"); + } + + try{ + + $anubis_data = $this->anubis->scrape($html); + }catch(Exception $error){ + + throw new Exception($error); + } + + // send anubis response & get cookies + // https://old-search.marginalia.nu/.within.website/x/cmd/anubis/api/pass-challenge?response=0000018966b086834f738bacba6031028adb5aa875974ead197a8b75778baf3a&nonce=39947&redir=https%3A%2F%2Fold-search.marginalia.nu%2F&elapsedTime=1164 + + try{ + + $anubis_key = + $this->get( + $proxy, + "https://old-search.marginalia.nu/.within.website/x/cmd/anubis/api/pass-challenge", + [ + "response" => $anubis_data["response"], + "nonce" => $anubis_data["nonce"], + "redir" => "https://old-search.marginalia.nu/", + "elapsedTime" => random_int(1000, 2000) + ], + 0 + ); + }catch(Exception $error){ + + throw new Exception("Failed to submit anubis challenge"); + } + + apcu_store("marginalia_cookie", $anubis_key); + } + if($get["npt"]){ [$params, $proxy] = @@ -279,7 +375,9 @@ class marginalia{ $html = $this->get( $proxy, - "https://old-search.marginalia.nu/search?" . $params + "https://old-search.marginalia.nu/search?" . $params, + [], + $anubis_key ); }catch(Exception $error){ @@ -309,7 +407,8 @@ class marginalia{ $this->get( $proxy, "https://old-search.marginalia.nu/search", - $params + $params, + $anubis_key ); }catch(Exception $error){