From 2976c0a6a4c3ef72784a92867edd7df32ef67d3a Mon Sep 17 00:00:00 2001 From: lolcat Date: Sun, 24 Mar 2024 22:31:19 -0400 Subject: [PATCH] bot protection --- data/config.php | 42 +++++++++++++++++++++++++-------- docs/configure.md | 5 +++- images.php | 2 +- lib/backend.php | 2 ++ lib/frontend.php | 58 ++++++++++++++++++++++++++++------------------ lib/fuckhtml.php | 4 ++-- music.php | 2 +- news.php | 2 +- scraper/google.php | 1 + videos.php | 2 +- web.php | 2 +- 11 files changed, 82 insertions(+), 40 deletions(-) diff --git a/data/config.php b/data/config.php index 1b10d14..cc1961a 100644 --- a/data/config.php +++ b/data/config.php @@ -23,17 +23,13 @@ class config{ // Enable the API? const API_ENABLED = true; - // Bot protection - // 4get.ca has been hit with 500k bot reqs every single day for months - // you probably want to enable this if your instance is public... - // 0 = disabled - // 1 = ask for image captcha (requires imagemagick v6 or higher) - // @TODO: 2 = invite only (users needs a pass) - const BOT_PROTECTION = 0; + // + // BOT PROTECTION + // - // Maximal number of searches per captcha key/pass issued. Counter gets - // reset on every APCU cache clear (should happen once a day) - const MAX_SEARCHES = 100; + // 0 = disabled, 1 = ask for image captcha, @TODO: 2 = invite only (users need a pass) + // VERY useful against a targeted attack + const BOT_PROTECTION = 0; // if BOT_PROTECTION is set to 1, specify the available datasets here // images should be named from 1.png to X.png, and be 100x100 in size @@ -45,6 +41,32 @@ class config{ // ["minecraft", 848] ]; + // If this regex matches the user agent, the request is blocked + // Not useful at all against a targeted attack + const HEADER_REGEX = '/bot|wget|curl|python-requests|scrapy|go-http-client|ruby|yahoo|spider/i'; + + // Block clients who present any of the following headers in their request (SPECIFY IN !!lowercase!!) 
+ // Eg: ["x-forwarded-for", "x-via", "forwarded-for", "via"]; + // Useful for blocking *some* proxies used for botting + const FILTERED_HEADER_KEYS = [ + "x-forwarded-for", + "x-via", + "forwarded-for", + "via" + ]; + + // @TODO: Portscan the user for open proxies before allowing a connection, block user if any are found + // Requires the nmap package + const NMAP_PROXY_CHECK = false; + + // @TODO: Make IP blacklist public under /api/v1/blacklist endpoint ? + const PUBLIC_IP_BLACKLIST = true; + + // Maximal number of searches per captcha key/pass issued. Counter gets + // reset on every APCU cache clear (should happen once a day). + // Only useful when BOT_PROTECTION is NOT set to 0 + const MAX_SEARCHES = 100; + // List of domains that point to your servers. Include your tor/i2p // addresses here! Must be a valid URL. Won't affect links placed on // the homepage. diff --git a/docs/configure.md b/docs/configure.md index fc8b0bb..c1b6afe 100644 --- a/docs/configure.md +++ b/docs/configure.md @@ -8,6 +8,9 @@ Welcome! This guide assumes that you have a working 4get instance. This will hel 3. The captcha imagesets are located in `data/captcha/your_image_set/*.png` 4. The captcha font is located in `data/fonts/captcha.ttf` +## Robots.txt +Make sure you configure this right to optimize your search engine presence! Head over to `/robots.txt` and change the 4get.ca domain to your own domain. + ## Server listing To be listed on https://4get.ca/instances , you must contact *any* of the people in the server list and ask them to add you to their list of instances in their configuration. The instance list is distributed, and I don't have control over it. @@ -32,4 +35,4 @@ If you see spammy entries in your instances list, simply remove the instance fro Done! The scraper you chose should now be using the rotating proxies. When asking for the next page of results, it will use the same proxy to avoid detection! ### Important! 
-If you ever test out a `socks5` proxy locally on your machine and find out it works but doesn't on your server, try supplying the `socks5_hostname` protocol instead. +If you ever test out a `socks5` proxy locally on your machine and find out it works but doesn't on your server, try supplying the `socks5_hostname` protocol instead. Hopefully this tip can save you 3 hours of your life! diff --git a/images.php b/images.php index 3c4df15..99fc9d6 100644 --- a/images.php +++ b/images.php @@ -29,7 +29,7 @@ try{ }catch(Exception $error){ - $frontend->drawscrapererror($error->getMessage(), $get, "images"); + $frontend->drawscrapererror($error->getMessage(), $get, "images", $payload["timetaken"]); } if(count($results["image"]) === 0){ diff --git a/lib/backend.php b/lib/backend.php index c76a0be..8033633 100644 --- a/lib/backend.php +++ b/lib/backend.php @@ -32,6 +32,8 @@ class backend{ $proxylist = array_values($proxylist); + echo $proxy_index_raw % count($proxylist); + return $proxylist[$proxy_index_raw % count($proxylist)]; } diff --git a/lib/frontend.php b/lib/frontend.php index d82dba2..68398b5 100644 --- a/lib/frontend.php +++ b/lib/frontend.php @@ -44,7 +44,7 @@ class frontend{ $replacements["timetaken"] !== null ){ - $replacements["timetaken"] = '
Took ' . substr(microtime(true) - $replacements["timetaken"], 0, 4) . 's
'; + $replacements["timetaken"] = '
Took ' . number_format(microtime(true) - $replacements["timetaken"], 2) . 's
'; } $handle = fopen("template/{$template}", "r"); @@ -84,29 +84,54 @@ class frontend{ "filters" => $this->generatehtmlfilters($filters, $get) ]); + $headers_raw = getallheaders(); + $header_keys = []; + $user_agent = ""; + $bad_header = false; + + foreach($headers_raw as $headerkey => $headervalue){ + + $headerkey = strtolower($headerkey); + if($headerkey == "user-agent"){ + + $user_agent = $headervalue; + continue; + } + + // check header key + if(in_array($headerkey, config::FILTERED_HEADER_KEYS)){ + + $bad_header = true; + break; + } + } + if( preg_match( - '/bot|wget|curl|python-requests|scrapy|feedfetcher|go-http-client|ruby|universalfeedparser|yahoo\! slurp|spider|rss/i', - $_SERVER["HTTP_USER_AGENT"] - ) + config::HEADER_REGEX, + $user_agent + ) || + $bad_header === true ){ // bot detected !! apcu_inc("captcha_gen"); + $null = null; $this->drawerror( "Tshh, blocked!", - 'You were blocked from viewing this page. If you wish to scrape data from 4get, please consider running your own 4get instance.', + 'Your browser, IP or IP range has been blocked from this 4get instance. If this is an error, please contact the administrator.', + microtime(true) ); die(); } } - public function drawerror($title, $error){ + public function drawerror($title, $error, $timetaken){ echo $this->load("search.html", [ - "timetaken" => null, + "timetaken" => $timetaken, "class" => "", "right-left" => "", "right-right" => "", @@ -119,7 +144,7 @@ class frontend{ die(); } - public function drawscrapererror($error, $get, $target){ + public function drawscrapererror($error, $get, $target, $timetaken){ $this->drawerror( "Shit", @@ -131,7 +156,8 @@ class frontend{ '
  • Remove keywords that could cause errors
  • ' . '
  • buildquery($get, false) . '">Try your search on another 4get instance
  • ' . '
    ' . - 'If the error persists, please contact the administrator.' + 'If the error persists, please contact the administrator.', + $timetaken ); } @@ -483,10 +509,6 @@ class frontend{ $archives[] = "warosu.org"; break; - case "cm": - $archives[] = "boards.fireden.net"; - break; - case "f": $archives[] = "archive.4plebs.org"; break; @@ -503,12 +525,10 @@ class frontend{ break; case "v": - $archives[] = "boards.fireden.net"; $archives[] = "arch.b4k.co"; break; case "vg": - $archives[] = "boards.fireden.net"; $archives[] = "arch.b4k.co"; break; @@ -579,7 +599,6 @@ class frontend{ break; case "sci": - $archives[] = "boards.fireden.net"; $archives[] = "warosu.org"; $archives[] = "eientei.xyz"; break; @@ -614,7 +633,6 @@ class frontend{ break; case "ic": - $archives[] = "boards.fireden.net"; $archives[] = "warosu.org"; break; @@ -741,10 +759,6 @@ class frontend{ $archives[] = "desuarchive.org"; break; - case "y": - $archives[] = "boards.fireden.net"; - break; - case "t": $archives[] = "archiveofsins.com"; break; @@ -802,7 +816,7 @@ class frontend{ $payload .= 'goGoogle cache' . 'arArchive.org' . - 'arArchive.is' . + 'arArchive.is' . 'ghGhostarchive' . 'biBing cache' . 'meMegalodon' . 
diff --git a/lib/fuckhtml.php b/lib/fuckhtml.php index ed1252c..6895fbf 100644 --- a/lib/fuckhtml.php +++ b/lib/fuckhtml.php @@ -73,7 +73,7 @@ class fuckhtml{ $attributes = []; preg_match_all( - '/([^\/\s\\=]+)(?:\s*=\s*("[^"]*"|\'[^\']*\'|[^\s]*))?/', + '/([^\/\s\\=]+)(?:\s*=\s*("[^"]*"|\'[^\']*\'|[^\s]*))?/i', $starting_tags[2][$i][0], $regex_attributes ); @@ -88,7 +88,7 @@ class fuckhtml{ continue; } - $attributes[$regex_attributes[1][$k]] = + $attributes[strtolower($regex_attributes[1][$k])] = trim($regex_attributes[2][$k], "'\" \n\r\t\v\x00"); } diff --git a/music.php b/music.php index 0162d4c..c49fc08 100644 --- a/music.php +++ b/music.php @@ -31,7 +31,7 @@ try{ }catch(Exception $error){ - $frontend->drawscrapererror($error->getMessage(), $get, "music"); + $frontend->drawscrapererror($error->getMessage(), $get, "music", $payload["timetaken"]); } $categories = [ diff --git a/news.php b/news.php index 3d5030a..b205819 100644 --- a/news.php +++ b/news.php @@ -31,7 +31,7 @@ try{ }catch(Exception $error){ - $frontend->drawscrapererror($error->getMessage(), $get, "news"); + $frontend->drawscrapererror($error->getMessage(), $get, "news", $payload["timetaken"]); } /* diff --git a/scraper/google.php b/scraper/google.php index 1485436..77aaa3c 100644 --- a/scraper/google.php +++ b/scraper/google.php @@ -654,6 +654,7 @@ class google{ throw new Exception("Failed to get HTML"); } + //$html = file_get_contents("scraper/google.html"); } diff --git a/videos.php b/videos.php index 868e530..35f3cbb 100644 --- a/videos.php +++ b/videos.php @@ -31,7 +31,7 @@ try{ }catch(Exception $error){ - $frontend->drawscrapererror($error->getMessage(), $get, "videos"); + $frontend->drawscrapererror($error->getMessage(), $get, "videos", $payload["timetaken"]); } $categories = [ diff --git a/web.php b/web.php index 42675d9..ff1fc13 100644 --- a/web.php +++ b/web.php @@ -31,7 +31,7 @@ try{ }catch(Exception $error){ - $frontend->drawscrapererror($error->getMessage(), $get, "web"); + 
$frontend->drawscrapererror($error->getMessage(), $get, "web", $payload["timetaken"]); } /*