Bot protection
This commit is contained in:
parent 81502d4721
commit 2976c0a6a4
@@ -23,17 +23,13 @@ class config{
// Enable the API?
const API_ENABLED = true;

// Bot protection
// 4get.ca has been hit with 500k bot reqs every single day for months
// you probably want to enable this if your instance is public...
// 0 = disabled
// 1 = ask for image captcha (requires imagemagick v6 or higher)
// @TODO: 2 = invite only (user needs a pass)
const BOT_PROTECTION = 0;
//
// BOT PROTECTION
//

// Maximal number of searches per captcha key/pass issued. Counter gets
// reset on every APCU cache clear (should happen once a day)
const MAX_SEARCHES = 100;
// 0 = disabled, 1 = ask for image captcha, @TODO: 2 = invite only (user needs a pass)
// VERY useful against a targeted attack
const BOT_PROTECTION = 0;

// if BOT_PROTECTION is set to 1, specify the available datasets here
// images should be named from 1.png to X.png, and be 100x100 in size
@@ -45,6 +41,32 @@ class config{
// ["minecraft", 848]
];

// If this regex expression matches on the user agent, it blocks the request
// Not useful at all against a targeted attack
const HEADER_REGEX = '/bot|wget|curl|python-requests|scrapy|go-http-client|ruby|yahoo|spider/i';

// Block clients who present any of the following headers in their request (SPECIFY IN !!lowercase!!)
// Eg: ["x-forwarded-for", "x-via", "forwarded-for", "via"];
// Useful for blocking *some* proxies used for botting
const FILTERED_HEADER_KEYS = [
"x-forwarded-for",
"x-via",
"forwarded-for",
"via"
];

// @TODO: Portscan the user for open proxies before allowing a connection, block user if any are found
// Requires the nmap package
const NMAP_PROXY_CHECK = false;

// @TODO: Make IP blacklist public under /api/v1/blacklist endpoint ?
const PUBLIC_IP_BLACKLIST = true;

// Maximal number of searches per captcha key/pass issued. Counter gets
// reset on every APCU cache clear (should happen once a day).
// Only useful when BOT_PROTECTION is NOT set to 0
const MAX_SEARCHES = 100;

// List of domains that point to your servers. Include your tor/i2p
// addresses here! Must be a valid URL. Won't affect links placed on
// the homepage.
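As a reading aid, here is a minimal standalone sketch of how `HEADER_REGEX` and `FILTERED_HEADER_KEYS` are intended to be applied to an incoming request. It mirrors the frontend check added later in this commit; the `is_blocked_request()` helper and the `data/config.php` include path are illustrative assumptions, not part of the commit.

```php
<?php
require_once "data/config.php"; // assumed location of the config class

// Hypothetical helper: returns true when the request should be rejected as a bot
function is_blocked_request(array $headers): bool{

	foreach($headers as $key => $value){

		$key = strtolower($key);

		// reject clients leaking proxy-style headers
		if(in_array($key, config::FILTERED_HEADER_KEYS)){

			return true;
		}

		// reject user agents matching the blocklist regex
		if(
			$key === "user-agent" &&
			preg_match(config::HEADER_REGEX, $value)
		){

			return true;
		}
	}

	return false;
}

// Example: a curl user agent gets blocked, a regular browser does not
var_dump(is_blocked_request(["User-Agent" => "curl/8.5.0"]));                      // bool(true)
var_dump(is_blocked_request(["User-Agent" => "Mozilla/5.0 (X11; Linux x86_64)"])); // bool(false)
```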
@@ -8,6 +8,9 @@ Welcome! This guide assumes that you have a working 4get instance. This will hel
3. The captcha imagesets are located in `data/captcha/your_image_set/*.png`
4. The captcha font is located in `data/fonts/captcha.ttf`

## Robots.txt
Make sure you configure this right to optimize your search engine presence! Head over to `/robots.txt` and change the 4get.ca domain to your own domain.

## Server listing
To be listed on https://4get.ca/instances , you must contact *any* of the people in the server list and ask them to add you to their list of instances in their configuration. The instance list is distributed, and I don't have control over it.
@@ -32,4 +35,4 @@ If you see spammy entries in your instances list, simply remove the instance fro
Done! The scraper you chose should now be using the rotating proxies. When asking for the next page of results, it will use the same proxy to avoid detection!

### Important!
If you ever test out a `socks5` proxy locally on your machine and find that it works but doesn't on your server, try supplying the `socks5_hostname` protocol instead.
If you ever test out a `socks5` proxy locally on your machine and find that it works but doesn't on your server, try supplying the `socks5_hostname` protocol instead. Hopefully this tip can save you 3 hours of your life!
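For context on that last tip: the difference between the two protocols is where DNS resolution happens. With plain `socks5` your own machine resolves the hostname and only the IP is sent through the proxy; with `socks5_hostname` (curl's `socks5h://`) the proxy performs the lookup itself, which is often what a server with a restricted or differently-resolving DNS setup needs. A small sketch with PHP's curl bindings; the proxy address and target URL are placeholders:

```php
<?php
$ch = curl_init("https://example.com/");

// Plain socks5: the hostname is resolved locally, then the IP goes through the proxy
//curl_setopt($ch, CURLOPT_PROXY, "socks5://127.0.0.1:1080");

// socks5_hostname (socks5h): the hostname itself is sent to the proxy,
// which performs the DNS lookup on its side
curl_setopt($ch, CURLOPT_PROXY, "socks5h://127.0.0.1:1080");

curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
$body = curl_exec($ch);

if($body === false){

	echo "curl error: " . curl_error($ch) . "\n";
}

curl_close($ch);
```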
@@ -29,7 +29,7 @@ try{

}catch(Exception $error){

$frontend->drawscrapererror($error->getMessage(), $get, "images");
$frontend->drawscrapererror($error->getMessage(), $get, "images", $payload["timetaken"]);
}

if(count($results["image"]) === 0){
@@ -32,6 +32,8 @@ class backend{

$proxylist = array_values($proxylist);

echo $proxy_index_raw % count($proxylist);

return $proxylist[$proxy_index_raw % count($proxylist)];
}
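The modulo in the lines above is what makes the proxy rotation wrap around: `array_values()` re-indexes the list so its keys are sequential, and any raw counter value then maps onto a valid entry. A tiny illustration with made-up proxies and a made-up counter:

```php
<?php
// Hypothetical proxy list whose keys have gaps (e.g. after filtering out dead entries)
$proxylist = [
	0 => "socks5h://10.0.0.1:1080",
	3 => "socks5h://10.0.0.2:1080"
];

$proxy_index_raw = 7; // hypothetical per-search counter

// Re-index so the modulo result is always a defined key
$proxylist = array_values($proxylist);

// 7 % 2 = 1 -> the second proxy is picked; the same counter always maps to
// the same proxy, so paginated requests reuse it
echo $proxylist[$proxy_index_raw % count($proxylist)] . "\n";
```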
@@ -44,7 +44,7 @@ class frontend{
$replacements["timetaken"] !== null
){

$replacements["timetaken"] = '<div class="timetaken">Took ' . substr(microtime(true) - $replacements["timetaken"], 0, 4) . 's</div>';
$replacements["timetaken"] = '<div class="timetaken">Took ' . number_format(microtime(true) - $replacements["timetaken"], 2) . 's</div>';
}

$handle = fopen("template/{$template}", "r");
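The `substr()` to `number_format()` change above is more than cosmetic: casting the elapsed float to a string and keeping 4 characters produces inconsistent output, and very small deltas stringify in scientific notation, whereas `number_format(..., 2)` always yields two decimals. A quick comparison with made-up timings:

```php
<?php
// Hypothetical elapsed times, in seconds
$fast = 0.00005;
$slow = 12.34567;

// Old approach: stringify the float and keep the first 4 characters
echo "Took " . substr($fast, 0, 4) . "s\n"; // "Took 5.0Es" (the float prints as 5.0E-5)
echo "Took " . substr($slow, 0, 4) . "s\n"; // "Took 12.3s"

// New approach: a fixed two decimal places
echo "Took " . number_format($fast, 2) . "s\n"; // "Took 0.00s"
echo "Took " . number_format($slow, 2) . "s\n"; // "Took 12.35s"
```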
@@ -84,29 +84,54 @@ class frontend{
"filters" => $this->generatehtmlfilters($filters, $get)
]);

$headers_raw = getallheaders();
$header_keys = [];
$user_agent = "";
$bad_header = false;

foreach($headers_raw as $headerkey => $headervalue){

$headerkey = strtolower($headerkey);
if($headerkey == "user-agent"){

$user_agent = $headervalue;
continue;
}

// check header key
if(in_array($headerkey, config::FILTERED_HEADER_KEYS)){

$bad_header = true;
break;
}
}

if(
preg_match(
'/bot|wget|curl|python-requests|scrapy|feedfetcher|go-http-client|ruby|universalfeedparser|yahoo\! slurp|spider|rss/i',
$_SERVER["HTTP_USER_AGENT"]
)
config::HEADER_REGEX,
$user_agent
) ||
$bad_header === true
){

// bot detected !!
apcu_inc("captcha_gen");

$null = null;
$this->drawerror(
"Tshh, blocked!",
'You were blocked from viewing this page. If you wish to scrape data from 4get, please consider running <a href="https://git.lolcat.ca/lolcat/4get" rel="noreferrer nofollow">your own 4get instance</a>.',
'Your browser, IP or IP range has been blocked from this 4get instance. If this is an error, please <a href="/about">contact the administrator</a>.',
microtime(true)
);
die();
}
}

public function drawerror($title, $error){
public function drawerror($title, $error, $timetaken){

echo
$this->load("search.html", [
"timetaken" => null,
"timetaken" => $timetaken,
"class" => "",
"right-left" => "",
"right-right" => "",
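One behavioural detail worth noting in the block above: the old hard-coded pattern is broader than the new `config::HEADER_REGEX` default introduced by this commit, so `feedfetcher`, `universalfeedparser` and `rss` user agents are no longer blocked out of the box. A quick way to compare the two (the feed-reader user agent string is just an example):

```php
<?php
$old_pattern = '/bot|wget|curl|python-requests|scrapy|feedfetcher|go-http-client|ruby|universalfeedparser|yahoo\! slurp|spider|rss/i';
$new_default = '/bot|wget|curl|python-requests|scrapy|go-http-client|ruby|yahoo|spider/i';

$user_agent = "UniversalFeedParser/5.2.1"; // hypothetical feed reader UA

var_dump((bool) preg_match($old_pattern, $user_agent)); // bool(true)  -> blocked before
var_dump((bool) preg_match($new_default, $user_agent)); // bool(false) -> allowed with the new default
```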
@@ -119,7 +144,7 @@ class frontend{
die();
}

public function drawscrapererror($error, $get, $target){
public function drawscrapererror($error, $get, $target, $timetaken){

$this->drawerror(
"Shit",
@@ -131,7 +156,8 @@ class frontend{
'<li>Remove keywords that could cause errors</li>' .
'<li><a href="/instances?target=' . $target . "&" . $this->buildquery($get, false) . '">Try your search on another 4get instance</a></li>' .
'</ul><br>' .
'If the error persists, please <a href="/about">contact the administrator</a>.'
'If the error persists, please <a href="/about">contact the administrator</a>.',
$timetaken
);
}
@@ -483,10 +509,6 @@ class frontend{
$archives[] = "warosu.org";
break;

case "cm":
$archives[] = "boards.fireden.net";
break;

case "f":
$archives[] = "archive.4plebs.org";
break;
@@ -503,12 +525,10 @@ class frontend{
break;

case "v":
$archives[] = "boards.fireden.net";
$archives[] = "arch.b4k.co";
break;

case "vg":
$archives[] = "boards.fireden.net";
$archives[] = "arch.b4k.co";
break;
@@ -579,7 +599,6 @@ class frontend{
break;

case "sci":
$archives[] = "boards.fireden.net";
$archives[] = "warosu.org";
$archives[] = "eientei.xyz";
break;
@@ -614,7 +633,6 @@ class frontend{
break;

case "ic":
$archives[] = "boards.fireden.net";
$archives[] = "warosu.org";
break;
@@ -741,10 +759,6 @@ class frontend{
$archives[] = "desuarchive.org";
break;

case "y":
$archives[] = "boards.fireden.net";
break;

case "t":
$archives[] = "archiveofsins.com";
break;
@@ -802,7 +816,7 @@ class frontend{
$payload .=
'<a href="https://webcache.googleusercontent.com/search?q=cache:' . $urlencode . '" class="list" target="_BLANK"><img src="/favicon?s=https://google.com" alt="go">Google cache</a>' .
'<a href="https://web.archive.org/web/' . $urlencode . '" class="list" target="_BLANK"><img src="/favicon?s=https://archive.org" alt="ar">Archive.org</a>' .
'<a href="https://archive.is/newest/' . htmlspecialchars($link) . '" class="list" target="_BLANK"><img src="/favicon?s=https://archive.is" alt="ar">Archive.is</a>' .
'<a href="https://archive.ph/newest/' . htmlspecialchars($link) . '" class="list" target="_BLANK"><img src="/favicon?s=https://archive.is" alt="ar">Archive.is</a>' .
'<a href="https://ghostarchive.org/search?term=' . $urlencode . '" class="list" target="_BLANK"><img src="/favicon?s=https://ghostarchive.org" alt="gh">Ghostarchive</a>' .
'<a href="https://www.bing.com/search?q=url%3A' . $urlencode . '" class="list" target="_BLANK"><img src="/favicon?s=https://bing.com" alt="bi">Bing cache</a>' .
'<a href="https://megalodon.jp/?url=' . $urlencode . '" class="list" target="_BLANK"><img src="/favicon?s=https://megalodon.jp" alt="me">Megalodon</a>' .
@@ -73,7 +73,7 @@ class fuckhtml{
$attributes = [];

preg_match_all(
'/([^\/\s\\=]+)(?:\s*=\s*("[^"]*"|\'[^\']*\'|[^\s]*))?/',
'/([^\/\s\\=]+)(?:\s*=\s*("[^"]*"|\'[^\']*\'|[^\s]*))?/i',
$starting_tags[2][$i][0],
$regex_attributes
);
@@ -88,7 +88,7 @@ class fuckhtml{
continue;
}

$attributes[$regex_attributes[1][$k]] =
$attributes[strtolower($regex_attributes[1][$k])] =
trim($regex_attributes[2][$k], "'\" \n\r\t\v\x00");
}
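The `strtolower()` added above matters because HTML attribute names are case-insensitive, so markup written as `HREF="..."` should land under the same array key as `href="..."`. A standalone illustration using the same regex (the sample tag content is made up):

```php
<?php
// Hypothetical attribute string as it would appear inside a start tag
$tag_body = 'HREF="https://example.com" Class="link"';

preg_match_all(
	'/([^\/\s\\=]+)(?:\s*=\s*("[^"]*"|\'[^\']*\'|[^\s]*))?/i',
	$tag_body,
	$regex_attributes
);

$attributes = [];

foreach($regex_attributes[1] as $k => $name){

	// normalize the key so lookups like $attributes["href"] always work
	$attributes[strtolower($name)] =
		trim($regex_attributes[2][$k], "'\" \n\r\t\v\x00");
}

var_dump(isset($attributes["href"])); // bool(true), even though the markup says HREF
```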
@@ -31,7 +31,7 @@ try{

}catch(Exception $error){

$frontend->drawscrapererror($error->getMessage(), $get, "music");
$frontend->drawscrapererror($error->getMessage(), $get, "music", $payload["timetaken"]);
}

$categories = [
news.php
@@ -31,7 +31,7 @@ try{

}catch(Exception $error){

$frontend->drawscrapererror($error->getMessage(), $get, "news");
$frontend->drawscrapererror($error->getMessage(), $get, "news", $payload["timetaken"]);
}

/*
@@ -654,6 +654,7 @@ class google{

throw new Exception("Failed to get HTML");
}

//$html = file_get_contents("scraper/google.html");
}
@@ -31,7 +31,7 @@ try{

}catch(Exception $error){

$frontend->drawscrapererror($error->getMessage(), $get, "videos");
$frontend->drawscrapererror($error->getMessage(), $get, "videos", $payload["timetaken"]);
}

$categories = [