From 883a650f846af7ac60d358d772aa22cbf89fd807 Mon Sep 17 00:00:00 2001 From: lolcat Date: Fri, 9 Aug 2024 10:06:08 -0400 Subject: [PATCH] implement SSL check for botretards --- api/v1/ac.php | 3 +- data/config.php | 10 ++- docs/apache2-example.md | 195 ++++++++++++++++++++++++++++++++++++++++ docs/apache2.md | 90 ++++++++----------- lib/frontend.php | 22 ++++- scraper/mwmbl.php | 2 +- scraper/pinterest.php | 97 ++++++++++++-------- settings.php | 4 + 8 files changed, 328 insertions(+), 95 deletions(-) create mode 100644 docs/apache2-example.md diff --git a/api/v1/ac.php b/api/v1/ac.php index 107f5ec..ce9b3f2 100644 --- a/api/v1/ac.php +++ b/api/v1/ac.php @@ -19,7 +19,8 @@ class autocomplete{ "marginalia" => "https://search.marginalia.nu/suggest/?partial={searchTerms}", "yt" => "https://suggestqueries-clients6.youtube.com/complete/search?client=youtube&q={searchTerms}", "sc" => "", - "startpage" => "https://www.startpage.com/suggestions?q={searchTerms}&format=opensearch&segment=startpage.defaultffx&lui=english" + "startpage" => "https://www.startpage.com/suggestions?q={searchTerms}&format=opensearch&segment=startpage.defaultffx&lui=english", + "kagi" => "https://kagi.com/api/autosuggest?q={searchTerms}" ]; /* diff --git a/data/config.php b/data/config.php index cba8b66..0d44c19 100644 --- a/data/config.php +++ b/data/config.php @@ -63,6 +63,14 @@ class config{ //"via" ]; + // Block SSL ciphers used by CLI tools used for botting + // Basically a primitive version of Cloudflare's browser integrity check + // ** If curl can still access the site (with spoofed headers), please make sure you use the new apache2 config ** + // https://git.lolcat.ca/lolcat/4get/docs/apache2.md + const DISALLOWED_SSL = [ + // "TLS_AES_256_GCM_SHA384" // used by WGET and CURL + ]; + // Maximal number of searches per captcha key/pass issued. Counter gets // reset on every APCU cache clear (should happen once a day). 
// Only useful when BOT_PROTECTION is NOT set to 0 @@ -111,7 +119,7 @@ class config{ // Default user agent to use for scraper requests. Sometimes ignored to get specific webpages // Changing this might break things. - const USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:128.0) Gecko/20100101 Firefox/128.0"; + const USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:129.0) Gecko/20100101 Firefox/129.0"; // Proxy pool assignments for each scraper // false = Use server's raw IP diff --git a/docs/apache2-example.md b/docs/apache2-example.md new file mode 100644 index 0000000..09f5c1d --- /dev/null +++ b/docs/apache2-example.md @@ -0,0 +1,195 @@ +# Sample Apache2 configuration +This is the apache2 configuration file used on the 4get.ca official instance, in hopes that it's useful to you! + +Looking for the apache2 guide? go here.. + +```xml + + ServerName www.4get.ca + + SSLEngine On + SSLCertificateFile /etc/letsencrypt/live/4get.ca/fullchain.pem + SSLCertificateKeyFile /etc/letsencrypt/live/4get.ca/privkey.pem + SSLCertificateChainFile /etc/letsencrypt/live/4get.ca/chain.pem + + RedirectMatch 301 ^(.*)$ https://4get.ca$1 + + + + ServerName 4get.ca + + ServerAdmin will@lolcat.ca + DocumentRoot /var/www/4get + + SSLEngine On + SSLOptions +StdEnvVars + + #ErrorLog ${APACHE_LOG_DIR}/error.log + + AddOutputFilterByType DEFLATE application/json + AddOutputFilterByType DEFLATE application/javascript + AddOutputFilterByType DEFLATE application/x-javascript + AddOutputFilterByType DEFLATE text/html + AddOutputFilterByType DEFLATE text/plain + AddOutputFilterByType DEFLATE text/css + + SSLCertificateFile /etc/letsencrypt/live/4get.ca/fullchain.pem + SSLCertificateKeyFile /etc/letsencrypt/live/4get.ca/privkey.pem + SSLCertificateChainFile /etc/letsencrypt/live/4get.ca/chain.pem + + + Options -MultiViews + AllowOverride All + Require all granted + + RewriteEngine On + RewriteCond %{REQUEST_FILENAME} !-d + RewriteCond %{REQUEST_FILENAME} !-f + RewriteRule 
^([^\.]+)$ $1.php [NC,L] + + + # deny access to private resources + + Order Deny,allow + Deny from all + + + + + ServerName www.lolcat.ca + + SSLEngine On + SSLCertificateFile /etc/letsencrypt/live/4get.ca/fullchain.pem + SSLCertificateKeyFile /etc/letsencrypt/live/4get.ca/privkey.pem + SSLCertificateChainFile /etc/letsencrypt/live/4get.ca/chain.pem + + RedirectMatch 301 ^(.*)$ https://lolcat.ca$1 + + + + ServerName lolcat.ca + + ServerAdmin will@lolcat.ca + DocumentRoot /var/www/lolcat + + SSLEngine On + SSLOptions +StdEnvVars + + #ErrorLog ${APACHE_LOG_DIR}/error.log + + AddOutputFilterByType DEFLATE application/json + AddOutputFilterByType DEFLATE application/javascript + AddOutputFilterByType DEFLATE application/x-javascript + AddOutputFilterByType DEFLATE text/html + AddOutputFilterByType DEFLATE text/plain + AddOutputFilterByType DEFLATE text/css + + SSLCertificateFile /etc/letsencrypt/live/4get.ca/fullchain.pem + SSLCertificateKeyFile /etc/letsencrypt/live/4get.ca/privkey.pem + SSLCertificateChainFile /etc/letsencrypt/live/4get.ca/chain.pem + + + Options -MultiViews + AllowOverride All + Require all granted + + RewriteEngine On + RewriteCond %{REQUEST_FILENAME} !-d + RewriteCond %{REQUEST_FILENAME} !-f + RewriteRule ^([^\.]+)$ $1.php [NC,L] + + + + + ServerName www.nyym.co + + SSLEngine On + SSLCertificateFile /etc/letsencrypt/live/nyym.co/fullchain.pem + SSLCertificateKeyFile /etc/letsencrypt/live/nyym.co/privkey.pem + SSLCertificateChainFile /etc/letsencrypt/live/nyym.co/chain.pem + + RedirectMatch 301 ^(.*)$ https://nyym.co$1 + + + + ServerName nyym.co + + ServerAdmin will@lolcat.ca + DocumentRoot /var/www/nyym + + SSLEngine On + SSLOptions +StdEnvVars + + #ErrorLog ${APACHE_LOG_DIR}/error.log + + AddOutputFilterByType DEFLATE application/json + AddOutputFilterByType DEFLATE application/javascript + AddOutputFilterByType DEFLATE application/x-javascript + AddOutputFilterByType DEFLATE text/html + AddOutputFilterByType DEFLATE text/plain + 
AddOutputFilterByType DEFLATE text/css + + SSLCertificateFile /etc/letsencrypt/live/nyym.co/fullchain.pem + SSLCertificateKeyFile /etc/letsencrypt/live/nyym.co/privkey.pem + SSLCertificateChainFile /etc/letsencrypt/live/nyym.co/chain.pem + + + Options -MultiViews + AllowOverride All + Require all granted + + RewriteEngine On + RewriteCond %{REQUEST_FILENAME} !-d + RewriteCond %{REQUEST_FILENAME} !-f + RewriteRule ^([^\.]+)$ $1.php [NC,L] + + + + + ServerName git.lolcat.ca + + SSLEngine On + SSLOptions +StdEnvVars + + #ErrorLog ${APACHE_LOG_DIR}/error.log + + AddOutputFilterByType DEFLATE application/json + AddOutputFilterByType DEFLATE application/javascript + AddOutputFilterByType DEFLATE application/x-javascript + AddOutputFilterByType DEFLATE text/html + AddOutputFilterByType DEFLATE text/plain + AddOutputFilterByType DEFLATE text/css + + SSLCertificateFile /etc/letsencrypt/live/4get.ca/fullchain.pem + SSLCertificateKeyFile /etc/letsencrypt/live/4get.ca/privkey.pem + SSLCertificateChainFile /etc/letsencrypt/live/4get.ca/chain.pem + + ProxyPreserveHost On + ProxyRequests off + AllowEncodedSlashes NoDecode + ProxyPass / http://localhost:3000/ nocanon + + + + ServerName live.lolcat.ca + + ServerAdmin will@lolcat.ca + DocumentRoot /var/www/live + + SSLEngine On + SSLOptions +StdEnvVars + + #ErrorLog ${APACHE_LOG_DIR}/error.log + + AddOutputFilterByType DEFLATE application/json + AddOutputFilterByType DEFLATE application/javascript + AddOutputFilterByType DEFLATE application/x-javascript + AddOutputFilterByType DEFLATE text/html + AddOutputFilterByType DEFLATE text/plain + AddOutputFilterByType DEFLATE text/css + + SSLCertificateFile /etc/letsencrypt/live/4get.ca/fullchain.pem + SSLCertificateKeyFile /etc/letsencrypt/live/4get.ca/privkey.pem + SSLCertificateChainFile /etc/letsencrypt/live/4get.ca/chain.pem + +``` diff --git a/docs/apache2.md b/docs/apache2.md index e746a7e..1e79327 100644 --- a/docs/apache2.md +++ b/docs/apache2.md @@ -74,7 +74,7 @@ Now, edit the 
following file: `/etc/apache2/sites-available/000-default.conf`, r DocumentRoot /var/www/4get - Options +MultiViews + Options -MultiViews RewriteEngine On RewriteCond %{REQUEST_FILENAME} !-d RewriteCond %{REQUEST_FILENAME} !-f @@ -92,47 +92,56 @@ To make the above snippet work, please refer to our - SSLOptions +StdEnvVars - - - SSLOptions +StdEnvVars - - + AddOutputFilterByType DEFLATE application/json AddOutputFilterByType DEFLATE application/javascript AddOutputFilterByType DEFLATE application/x-javascript AddOutputFilterByType DEFLATE text/html AddOutputFilterByType DEFLATE text/plain AddOutputFilterByType DEFLATE text/css - + SSLCertificateFile /etc/letsencrypt/live/4get.ca/fullchain.pem SSLCertificateKeyFile /etc/letsencrypt/live/4get.ca/privkey.pem - -``` - -This ruleset tells apache2 where 4get is located (`/var/www/4get`), ensures that `4get.ca/settings` resolves to `4get.ca/settings.php` internally and that we deny access to `/data/*`, which may contain files you might want to keep private. -```xml - - ServerName 4get.ca - - DocumentRoot /var/www/4get - - Options +MultiViews - RewriteEngine On - RewriteCond %{REQUEST_FILENAME} !-d - RewriteCond %{REQUEST_FILENAME} !-f - RewriteRule ^([^\.]+)$ $1.php [NC,L] + SSLCertificateChainFile /etc/letsencrypt/live/4get.ca/chain.pem + + + Options -MultiViews + AllowOverride All + Require all granted + + RewriteEngine On + RewriteCond %{REQUEST_FILENAME} !-d + RewriteCond %{REQUEST_FILENAME} !-f + RewriteRule ^([^\.]+)$ $1.php [NC,L] + # deny access to private resources @@ -142,28 +151,7 @@ This ruleset tells apache2 where 4get is located (`/var/www/4get`), ensures that ``` -Don't forget to specify your other services here! Here's an example of a ruleset I use for `lolcat.ca`: -```xml - - ServerName lolcat.ca - - DocumentRoot /var/www/lolcat - - Options +MultiViews - RewriteEngine On - RewriteCond %{REQUEST_FILENAME} !-d - RewriteCond %{REQUEST_FILENAME} !-f - RewriteRule ^([^\.]+)$ $1.php [NC,L] - -``` - -... 
Alongside with it's redirect rules. -```xml - - ServerName www.lolcat.ca - RedirectMatch 301 ^(.*)$ https://lolcat.ca$1 - -``` +By default, the first rule dictates where traffic should be redirected to in case the client specifies an unknown domain name. Don't forget your webserver's other rules! For a complete real-world example, please check out my real-world config file I use on 4get.ca. ## security.conf If you enabled the `headers` module, you can head over to `/etc/apache2/conf-enabled/security.conf` and edit: diff --git a/lib/frontend.php b/lib/frontend.php index ef55f4d..71ed6d7 100644 --- a/lib/frontend.php +++ b/lib/frontend.php @@ -89,6 +89,7 @@ class frontend{ $user_agent = ""; $bad_header = false; + // block bots that present X-Forwarded-For, Via, etc foreach($headers_raw as $headerkey => $headervalue){ $headerkey = strtolower($headerkey); @@ -106,12 +107,27 @@ class frontend{ } } + // SSL check: flag clients whose negotiated cipher is listed in config::DISALLOWED_SSL (needs SSLOptions +StdEnvVars) + $bad_ssl = false; if( + isset($_SERVER["HTTPS"]) && + $_SERVER["HTTPS"] == "on" && + isset($_SERVER["SSL_CIPHER"]) && + in_array($_SERVER["SSL_CIPHER"], config::DISALLOWED_SSL) + ){ + + $bad_ssl = true; + } + + if( + $bad_header === true || + $bad_ssl === true || + $user_agent == "" || + // user agent check preg_match( config::HEADER_REGEX, $user_agent - ) || - $bad_header === true + ) ){ // bot detected !! @@ -1306,7 +1322,7 @@ class frontend{ return htmlspecialchars($image); } - return "/proxy?i=" . urlencode($image) . "&s=" . $format; + return "/proxy?i=" . urlencode($image) . "&s=" . 
$format; } public function htmlnextpage($gets, $npt, $page){ diff --git a/scraper/mwmbl.php b/scraper/mwmbl.php index f2f8b70..631b90c 100644 --- a/scraper/mwmbl.php +++ b/scraper/mwmbl.php @@ -52,7 +52,7 @@ class mwmbl{ curl_setopt($curlproc, CURLOPT_SSL_VERIFYHOST, 2); curl_setopt($curlproc, CURLOPT_SSL_VERIFYPEER, true); curl_setopt($curlproc, CURLOPT_CONNECTTIMEOUT, 30); - curl_setopt($curlproc, CURLOPT_TIMEOUT, 30); // @todo reset + curl_setopt($curlproc, CURLOPT_TIMEOUT, 30); $this->backend->assign_proxy($curlproc, $proxy); diff --git a/scraper/pinterest.php b/scraper/pinterest.php index f3c4439..3787f77 100644 --- a/scraper/pinterest.php +++ b/scraper/pinterest.php @@ -13,7 +13,7 @@ class pinterest{ return []; } - private function get($url, $get = []){ + private function get($proxy, $url, $get = []){ $curlproc = curl_init(); @@ -45,7 +45,7 @@ class pinterest{ curl_setopt($curlproc, CURLOPT_CONNECTTIMEOUT, 30); curl_setopt($curlproc, CURLOPT_TIMEOUT, 30); - $this->proxy->assign_proxy($curlproc); + $this->backend->assign_proxy($curlproc, $proxy); $data = curl_exec($curlproc); @@ -60,45 +60,63 @@ class pinterest{ public function image($get){ - $search = $get["s"]; - - $out = [ - "status" => "ok", - "npt" => null, - "image" => [] - ]; - - $filter = [ - "source_url" => "/search/pins/?q=" . urlencode($search), - "rs" => "typed", - "data" => - json_encode( - [ - "options" => [ - "article" => null, - "applied_filters" => null, - "appliedProductFilters" => "---", - "auto_correction_disabled" => false, - "corpus" => null, - "customized_rerank_type" => null, - "filters" => null, - "query" => $search, - "query_pin_sigs" => null, - "redux_normalize_feed" => true, - "rs" => "typed", - "scope" => "pins", // pins, boards, videos, - "source_id" => null - ], - "context" => [] - ] - ), - "_" => substr(str_replace(".", "", (string)microtime(true)), 0, -1) - ]; + if($get["npt"]){ + + // @TODO + // post data for next page + $data = [ + "source_url" => "/search/pins/?q=" . 
urlencode($search) . "&rs=typed", + "data" => + json_encode( + [ + // {"options":{"applied_filters":null,"appliedProductFilters":"---","article":null,"auto_correction_disabled":false,"corpus":null,"customized_rerank_type":null,"domains":null,"filters":null,"journey_depth":null,"page_size":null,"price_max":null,"price_min":null,"query_pin_sigs":null,"query":"higurashi","redux_normalize_feed":true,"rs":"typed","scope":"pins","selected_one_bar_modules":null,"source_id":null,"source_module_id":null,"top_pin_id":null,"bookmarks":["Y2JVSG81V2sxcmNHRlpWM1J5VFVad1ZsWlVRbXhpVmtreVZsZHpOV0pIU2tkV2FscFhVbXhhVkZreU1WSmtNREZWVjIxR1RrMXNTbEJXYlhSaFVtMVdjMVZ1U2xaaWEzQnpXVlJPVTJWV1pISlhhM1JYVm10V05sVldVbE5XVjBwMVVXMUdWVll6VFhoVWJYaFhWMVp3Ums1V1RsTmlSbGt5Vm10YWFtVkdWbkpOU0dSUFZsZG9XRmxzWkc5VlZscHlWbGhrYkdKR1NubFdWelZQWVVaYWRHVkVRbFppUmtwVVZrUktWMlJIVWtWV2JHaHBVakZLU0Zkc1pEUmtNVnBZVW10b2FsSXdXbkJXYlRWRFpHeGFSMWRzVG1oaGVrWllXV3RvVTFVeFpFaFZiRUpoVm5wRk1GbHFSbXRYVjA1R1YyczFWMVpHV2pSWFZtaDNVakZrY2sxWVRsaGlhM0JXV1ZSR1MyRkdiRlZTYm1SVVVteHdXbGxWVlRGVk1VbDVWRmhrVjAxdVVuWlVhMXBTWlVaT2MxcEhSbE5TTWswMVdtdGFWMU5YU2paVmJYaFRUVmhDUjFZeU5YZFVNVkY0VjJ0b1ZXRnJOVlpVVmxwTFVURndXR042VmxOV2ExcGFXVlZWTlZVeFNYZE5WRTVYVWtWYVZGWkhNVTlXTVU1WllVWk9hR1ZyV2s1WFZ6QXhZakpPVjFWWWFHRlNWbkJRVm14U1IwMUdXWGxOVkVKVlRWWnNORll5TURWV1YwVjVWV3hDV21FeGNETmFSVnByVjFkS1IyTkhhR2xYUjJkM1ZtdGFhMlF4VVhsVGJGcE9Wa1p3YjFwWGVFdFZWbFp4VW14YWJGWnRVbHBaTUdoTFZHMUtTR1ZJYUZkV2VrWjJWMVphU21ReVJYcGpSbFpwVW10d1RGZHJVa0pPVms1SFZHNVNUbFl3V2xoVmJYUldaVVpaZUZremFGUk5hM0JYVkZaYVYyRkZNSGxWYkVKYVlrWlZlRnBGV210WFIwNUpVMnMxVTFaR1dscFdWekI0VFVaV1IxTllaR3BUUlhCb1dWUkdWbVZHVm5SbFJuQnNZbFpKTWxSVlVYaFBSVGxGV1hwR1QyVnJSVEZVVlZKT1RrVXhSVkpVUWs5bGJFVXhWRmhzZDFOR1ZsWmtNMFp0VWpGYWIxZFhjRXBsUlRGSVZWaHdUbFl4YTNoVVZWSnFUVVUxV0ZadGFFOVNSVnB6Vkd0a1drMUdiRFpUVkVaT1pXMWplRmRzVWxkaFJuQllWVlJTVDJWdFRqWlVNVkpTWlZad2NWcEhkRTlsYTFwMFZGVlNhMkpWTVZWVFZFcE9Wa1pzTmxkWE1WSk9WVEYwVlcweFVGWXdXVFJXUjNSWFYwZGFRbEJVTVRoUFJHTXhUbnBCTlUxRVRUUk5SRVV3VG5wUk5VMTVjRWhWVlhkeFprUlZlRTlFVVRK
WlZHc3lUMWRSTWsxVVVUSk9iVnBvV1RKWmVrNTZXWGhPTWs1cFQwUkZNVTlFVm1sTlZGcHBUV3BTYTFsWFRtcE9SR015VG1wVk5GbHFaR2haVjFacldWUmFiVmxxWkdoYVZGWnFUa1JXT0ZSclZsaG1RVDA5fFVIbzVhRkpYZUc1WFYyUlpWVEpHYkdGNk1XWk5ha1ptVFZSR09FOUVZekZPZWtFMVRVUk5ORTFFUlRCT2VsRTFUWGx3U0ZWVmQzRm1SMWw1VFZSUk1WbDZUVEJhUjFGNVQxZFNhVnB0VlRGT1JFVXdXVlJuZVU1cVRUUk5hbU40VDBSSk1VNXFWVEZOYlZwcVdsUnJlRTFFVVhwWmVsVjNXbXBvYkU1dFJYbE9ha0Y2VDFSSk5VMTZWVEJaYWtJNFZHdFdXR1pCUFQwPXxOb25lfDg3NTcwOTAzODAxNDc0OTMqR1FMKnwzMjM3YjM3ZGNhMGU3YjYyYzYzYzAyZGJkNGU1MjdlNzMyMTExMTNlMmUyMzEyOWM2MDAzYmU1ZTlmZjkwYjAwfE5FV3w="]},"context":{}} + ] + ); + ]; + + }else{ + + $search = $get["s"]; + if(strlen($search) === 0){ + + throw new Exception("Search term is empty!"); + } + + $filter = [ + "source_url" => "/search/pins/?q=" . urlencode($search), + "rs" => "typed", + "data" => + json_encode( + [ + "options" => [ + "article" => null, + "applied_filters" => null, + "appliedProductFilters" => "---", + "auto_correction_disabled" => false, + "corpus" => null, + "customized_rerank_type" => null, + "filters" => null, + "query" => $search, + "query_pin_sigs" => null, + "redux_normalize_feed" => true, + "rs" => "typed", + "scope" => "pins", // pins, boards, videos, + "source_id" => null + ], + "context" => [] + ] + ), + "_" => substr(str_replace(".", "", (string)microtime(true)), 0, -1) + ]; + + $proxy = $this->backend->get_ip(); + } try{ $json = json_decode( $this->get( + $proxy, "https://www.pinterest.ca/resource/BaseSearchResource/get/", $filter ), @@ -115,7 +133,11 @@ class pinterest{ throw new Exception("Failed to decode JSON"); } - //print_r($json); + $out = [ + "status" => "ok", + "npt" => null, + "image" => [] + ]; foreach( $json @@ -189,7 +211,6 @@ class pinterest{ break; case "board": - if(isset($item["cover_pin"]["image_url"])){ $image = [ diff --git a/settings.php b/settings.php index 6f99e93..046e7c7 100644 --- a/settings.php +++ b/settings.php @@ -83,6 +83,10 @@ $settings = [ "value" => "startpage", "text" => "Startpage" ], + [ + 
"value" => "kagi", + "text" => "Kagi" + ], [ "value" => "qwant", "text" => "Qwant"