remove extremely low quality scrapers

This commit is contained in:
2026-04-26 16:22:24 -04:00
parent a54f212550
commit 9ea0372bb7
4 changed files with 1 addition and 910 deletions

View File

@@ -1,145 +0,0 @@
<?php
// crowdview scraper: queries the CrowdView forum-search API
// (crowdview-next-js.onrender.com) and maps the JSON response into the
// common result structure shared by all scrapers in this project.
class crowdview{

	public function __construct(){
		// backend handles proxy assignment for outgoing requests
		include "lib/backend.php";
		$this->backend = new backend("crowdview");
		// fuckhtml is the in-house HTML parser, used here only for
		// text extraction in sanitize()
		include "lib/fuckhtml.php";
		$this->fuckhtml = new fuckhtml();
	}

	// This scraper exposes no user-facing search filters.
	public function getfilters($page){
		return [];
	}

	/**
	 * Perform a GET request through the assigned proxy.
	 *
	 * @param mixed  $proxy proxy descriptor obtained from backend::get_ip()
	 * @param string $url   absolute URL without a query string
	 * @param array  $get   query parameters, appended when non-empty
	 * @return string raw response body
	 * @throws Exception on any curl transport error
	 */
	private function get($proxy, $url, $get = []){
		$curlproc = curl_init();

		if($get !== []){
			$get = http_build_query($get);
			$url .= "?" . $get;
		}

		curl_setopt($curlproc, CURLOPT_URL, $url);
		curl_setopt($curlproc, CURLOPT_ENCODING, ""); // default encoding
		curl_setopt($curlproc, CURLOPT_HTTPHEADER,
			["User-Agent: " . config::USER_AGENT,
			"Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
			"Accept-Language: en-US,en;q=0.5",
			"Accept-Encoding: gzip",
			"DNT: 1",
			"Connection: keep-alive",
			"Upgrade-Insecure-Requests: 1",
			"Sec-Fetch-Dest: document",
			"Sec-Fetch-Mode: navigate",
			"Sec-Fetch-Site: none",
			"Sec-Fetch-User: ?1"]
		);

		curl_setopt($curlproc, CURLOPT_RETURNTRANSFER, true);
		curl_setopt($curlproc, CURLOPT_SSL_VERIFYHOST, 2);
		curl_setopt($curlproc, CURLOPT_SSL_VERIFYPEER, true);
		curl_setopt($curlproc, CURLOPT_CONNECTTIMEOUT, 30);
		curl_setopt($curlproc, CURLOPT_TIMEOUT, 30);
		$this->backend->assign_proxy($curlproc, $proxy);

		$data = curl_exec($curlproc);

		if(curl_errno($curlproc)){
			// grab the error text before closing, then release the
			// handle so it is not leaked on the error path
			$error = curl_error($curlproc);
			curl_close($curlproc);
			throw new Exception($error);
		}

		curl_close($curlproc);
		return $data;
	}

	/**
	 * Web search endpoint.
	 *
	 * @param array $get sanitized request parameters; "s" is the query
	 * @return array standard scraper result structure
	 * @throws Exception on empty query, transport failure or bad JSON
	 */
	public function web($get){
		$search = $get["s"];
		if(strlen($search) === 0){
			throw new Exception("Search term is empty!");
		}

		$proxy = $this->backend->get_ip();

		try{
			$json = $this->get(
				$proxy,
				"https://crowdview-next-js.onrender.com/api/search-v3",
				[
					"query" => $search
				]
			);
		}catch(Exception $error){
			throw new Exception("Failed to fetch JSON");
		}

		$out = [
			"status" => "ok",
			"spelling" => [
				"type" => "no_correction",
				"using" => null,
				"correction" => null
			],
			"npt" => null, // API exposes no pagination token
			"answer" => [],
			"web" => [],
			"image" => [],
			"video" => [],
			"news" => [],
			"related" => []
		];

		$json = json_decode($json, true);

		if($json === NULL){
			throw new Exception("Failed to decode JSON");
		}

		// "results" may be absent on an empty result set
		foreach($json["results"] ?? [] as $item){
			// snippet format appears to be "<date> <b>…" — the text before
			// the first <b> tag is a human-readable date
			$description = explode("<b>", $item["snippet"] ?? "", 2);

			// strtotime() returns false on unparseable input; normalize
			// that to null so callers get a timestamp or null, never false
			$date = strtotime($description[0]);

			$out["web"][] = [
				"title" => $this->sanitize($item["title"]),
				// snippets without a <b> delimiter have no description part
				"description" =>
					isset($description[1]) ?
						$this->sanitize($description[1]) : null,
				"url" => $item["link"],
				"date" => $date === false ? null : $date,
				"type" => "web",
				"thumb" => [
					"url" => null,
					"ratio" => null
				],
				"sublink" => [],
				"table" => []
			];
		}

		return $out;
	}

	// Decode HTML entities, strip markup through fuckhtml and trim stray
	// dots/whitespace left over from the snippet format.
	private function sanitize($html){
		return
			trim(
				$this->fuckhtml
				->getTextContent(
					html_entity_decode(
						$html
					)
				),
				". "
			);
	}
}

View File

@@ -1,309 +0,0 @@
<?php
// curlie scraper: scrapes the curlie.org web directory search results
// into the common result structure shared by all scrapers.
class curlie{

	public function __construct(){
		// backend handles proxy assignment and pagination-token storage
		include "lib/backend.php";
		$this->backend = new backend("curlie");
		// fuckhtml is the in-house HTML parser used to walk result markup
		include "lib/fuckhtml.php";
		$this->fuckhtml = new fuckhtml();
	}

	/**
	 * Filters exposed to the user. Only the "web" tab supports a
	 * language filter; every other page has none.
	 *
	 * @param string $page requested result page ("web", "image", ...)
	 * @return array filter definition consumed by the frontend
	 */
	public function getfilters($page){

		if($page != "web"){
			return [];
		}

		return [
			"lang" => [
				"display" => "Language",
				"option" => [
					"any" => "Any language",
					"en" => "English",
					"de" => "German",
					"fr" => "French",
					"ja" => "Japanese",
					"it" => "Italian",
					"es" => "Spanish",
					"ru" => "Russian",
					"nl" => "Dutch",
					"pl" => "Polish",
					"tr" => "Turkish",
					"da" => "Danish",
					"sv" => "Swedish",
					"no" => "Norwegian",
					"is" => "Icelandic",
					"fo" => "Faroese",
					"fi" => "Finnish",
					"et" => "Estonian",
					"lt" => "Lithuanian",
					"lv" => "Latvian",
					"cy" => "Welsh",
					"ga" => "Irish",
					"gd" => "Scottish Gaelic",
					"br" => "Breton",
					"fy" => "Frisian",
					"frr" => "North Frisian",
					"gem" => "Saterland Frisian",
					"lb" => "Luxembourgish",
					"rm" => "Romansh",
					"pt" => "Portuguese",
					"ca" => "Catalan",
					"gl" => "Galician",
					"eu" => "Basque",
					"ast" => "Asturian",
					"an" => "Aragonese",
					"fur" => "Friulan",
					"sc" => "Sardinian",
					"scn" => "Sicilian",
					"oc" => "Occitan",
					"be" => "Belarusian",
					"cs" => "Czech",
					"hu" => "Hungarian",
					"sk" => "Slovak",
					"uk" => "Ukrainian",
					"csb" => "Kashubian",
					"tt" => "Tatar",
					"ba" => "Bashkir",
					"os" => "Ossetian",
					"sl" => "Slovene",
					"sr" => "Serbian",
					"hr" => "Croatian",
					"bs" => "Bosnian",
					"bg" => "Bulgarian",
					"sq" => "Albanian",
					"ro" => "Romanian",
					"mk" => "Macedonian",
					"el" => "Greek",
					"iw" => "Hebrew",
					"fa" => "Persian",
					"ar" => "Arabic",
					"ku" => "Kurdish",
					"az" => "Azerbaijani",
					"hy" => "Armenian",
					"af" => "Afrikaans",
					"sw" => "Kiswahili",
					"uz" => "Uzbek",
					"kk" => "Kazakh",
					"ky" => "Kyrgyz",
					"tg" => "Tajik",
					"tk" => "Turkmen",
					"ug" => "Uyghurche",
					"hi" => "Hindi",
					"si" => "Sinhalese",
					"gu" => "Gujarati",
					"ur" => "Urdu",
					"mr" => "Marathi",
					"pa" => "Punjabi",
					"bn" => "Bengali",
					"ta" => "Tamil",
					"te" => "Telugu",
					"kn" => "Kannada",
					"zh_CN" => "Chinese Simplified",
					"zh_TW" => "Chinese Traditional",
					"ko" => "Korean",
					"cfr" => "Taiwanese",
					"th" => "Thai",
					"vi" => "Vietnamese",
					"in" => "Indonesian",
					"ms" => "Malay",
					"tl" => "Tagalog",
					"eo" => "Esperanto",
					"ia" => "Interlingua",
					"la" => "Latin"
				]
			]
		];
	}

	/**
	 * Perform a GET request through the assigned proxy.
	 *
	 * @param mixed  $proxy proxy descriptor obtained from backend
	 * @param string $url   absolute URL without a query string
	 * @param array  $get   query parameters, appended when non-empty
	 * @return string raw response body
	 * @throws Exception on any curl transport error
	 */
	private function get($proxy, $url, $get = []){
		$curlproc = curl_init();

		if($get !== []){
			$get = http_build_query($get);
			$url .= "?" . $get;
		}

		curl_setopt($curlproc, CURLOPT_URL, $url);
		curl_setopt($curlproc, CURLOPT_ENCODING, ""); // default encoding
		curl_setopt($curlproc, CURLOPT_HTTPHEADER,
			["User-Agent: " . config::USER_AGENT,
			"Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
			"Accept-Language: en-US,en;q=0.5",
			"Accept-Encoding: gzip",
			"DNT: 1",
			"Connection: keep-alive",
			"Upgrade-Insecure-Requests: 1",
			"Sec-Fetch-Dest: document",
			"Sec-Fetch-Mode: navigate",
			"Sec-Fetch-Site: none",
			"Sec-Fetch-User: ?1"]
		);

		curl_setopt($curlproc, CURLOPT_RETURNTRANSFER, true);
		curl_setopt($curlproc, CURLOPT_SSL_VERIFYHOST, 2);
		curl_setopt($curlproc, CURLOPT_SSL_VERIFYPEER, true);
		curl_setopt($curlproc, CURLOPT_CONNECTTIMEOUT, 30);
		curl_setopt($curlproc, CURLOPT_TIMEOUT, 30);
		$this->backend->assign_proxy($curlproc, $proxy);

		$data = curl_exec($curlproc);

		if(curl_errno($curlproc)){
			// grab the error text before closing, then release the
			// handle so it is not leaked on the error path
			$error = curl_error($curlproc);
			curl_close($curlproc);
			throw new Exception($error);
		}

		curl_close($curlproc);
		return $data;
	}

	/**
	 * Web search endpoint.
	 *
	 * Fresh searches hit /search with the query; paginated requests
	 * replay the stored relative path from the "next-page" anchor.
	 *
	 * @param array $get sanitized request parameters ("s", "lang", "npt")
	 * @return array standard scraper result structure
	 * @throws Exception on transport failure
	 */
	public function web($get){

		if($get["npt"]){
			// resume a paginated search: npt token stores the relative
			// URL of the next page plus the proxy it was fetched with
			[$query, $proxy] = $this->backend->get($get["npt"], "web");

			try{
				$html = $this->get(
					$proxy,
					"https://curlie.org/" . $query,
					[]
				);
			}catch(Exception $error){
				throw new Exception("Failed to fetch search page");
			}
		}else{
			$proxy = $this->backend->get_ip();

			$query = [
				"q" => $get["s"],
				"start" => 0,
				"stime" => 92452189 // ?
			];

			// "any" means no language restriction, so omit the parameter
			if($get["lang"] !== "any"){
				$query["lang"] = $get["lang"];
			}

			try{
				$html = $this->get(
					$proxy,
					"https://curlie.org/search",
					$query
				);
			}catch(Exception $error){
				throw new Exception("Failed to fetch search page");
			}
		}

		$this->fuckhtml->load($html);

		// pagination: presence of a "next-page" anchor means more results
		$nextpage =
			$this->fuckhtml
			->getElementsByClassName(
				"next-page",
				"a"
			);

		if(count($nextpage) !== 0){

			$nextpage =
				$this->backend->store(
					$nextpage[0]["attributes"]["href"],
					"web",
					$proxy
				);
		}else{
			$nextpage = null;
		}

		$out = [
			"status" => "ok",
			"spelling" => [
				"type" => "no_correction",
				"using" => null,
				"correction" => null
			],
			"npt" => $nextpage,
			"answer" => [],
			"web" => [],
			"image" => [],
			"video" => [],
			"news" => [],
			"related" => []
		];

		$items =
			$this->fuckhtml
			->getElementsByClassName(
				"site-item",
				"div"
			);

		foreach($items as $item){

			// re-scope the parser to this result only
			$this->fuckhtml->load($item);

			// the outbound link is the only anchor with target="_blank"
			$a =
				$this->fuckhtml
				->getElementsByAttributeValue(
					"target",
					"_blank",
					"a"
				);

			if(count($a) === 0){
				// malformed listing without an outbound link; skip it
				// instead of indexing into an empty array
				continue;
			}

			$a = $a[0];

			$description =
				$this->fuckhtml
				->getElementsByClassName("site-descr");

			if(count($description) !== 0){

				$description =
					$this->fuckhtml
					->getTextContent(
						$description[0]
					);
			}else{
				// directory entries may lack a description
				$description = null;
			}

			$out["web"][] = [
				"title" =>
					$this->fuckhtml
					->getTextContent(
						$a
					),
				"description" => $description,
				"url" =>
					$this->fuckhtml
					->getTextContent(
						$a["attributes"]["href"]
					),
				"date" => null, // curlie exposes no dates
				"type" => "web",
				"thumb" => [
					"url" => null,
					"ratio" => null
				],
				"sublink" => [],
				"table" => []
			];
		}

		return $out;
	}
}

View File

@@ -1,452 +0,0 @@
<?php
// greppr scraper: scrapes greppr.org web search results.
// The site has no API; a per-session request token must first be scraped
// from the homepage form and replayed (with cookies) on /search.
// greppr dev probably monitors 4get code, lol
// hello greppr dude, add an API you moron
class greppr{

	public function __construct(){
		// backend handles proxy assignment and pagination-token storage
		include "lib/backend.php";
		$this->backend = new backend("greppr");
		// fuckhtml is the in-house HTML parser used to walk result markup
		include "lib/fuckhtml.php";
		$this->fuckhtml = new fuckhtml();
	}

	// This scraper exposes no user-facing search filters.
	public function getfilters($page){
		return [];
	}

	/**
	 * Perform a proxied HTTP request and capture response headers.
	 *
	 * Three request shapes are produced:
	 *   - GET without cookies (first homepage hit),
	 *   - GET with a Cookie header and /search Referer (pagination),
	 *   - POST form submission with cookies (initial search).
	 *
	 * @param mixed  $proxy   proxy descriptor from backend
	 * @param string $url     absolute URL
	 * @param array  $get     query parameters (GET) or form fields (POST)
	 * @param array  $cookies cookie name => value map to replay
	 * @param bool   $post    true to send a POST instead of a GET
	 * @return array ["headers" => lowercased header map, "data" => body]
	 * @throws Exception on any curl transport error
	 *
	 * NOTE(review): when $post === false and $get is non-empty, $url is
	 * appended to AFTER CURLOPT_URL was already set, so GET parameters are
	 * never actually sent; pagination works only because the stored href
	 * already embeds its query string. Also, the curl handle is not closed
	 * on the error path before throwing.
	 */
	private function get($proxy, $url, $get = [], $cookies = [], $post = false){
		$curlproc = curl_init();

		curl_setopt($curlproc, CURLOPT_URL, $url);
		curl_setopt($curlproc, CURLOPT_ENCODING, ""); // default encoding

		// flatten the cookie map into a single Cookie header value
		$cookie = [];
		foreach($cookies as $k => $v){
			$cookie[] = "{$k}={$v}";
		}

		$cookie = implode("; ", $cookie);

		if($post === false){
			if($get !== []){
				$get = http_build_query($get);
				$url .= "?" . $get;
			}

			if($cookie == ""){
				// first-contact GET: mimic a browser landing on the site
				curl_setopt($curlproc, CURLOPT_HTTPHEADER,
					["User-Agent: " . config::USER_AGENT,
					"Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
					"Accept-Language: en-US,en;q=0.5",
					"Accept-Encoding: gzip",
					"DNT: 1",
					"Connection: keep-alive",
					"Upgrade-Insecure-Requests: 1",
					"Sec-Fetch-Dest: document",
					"Sec-Fetch-Mode: navigate",
					"Sec-Fetch-Site: none",
					"Sec-Fetch-User: ?1"]
				);
			}else{
				// follow-up GET: replay session cookies and pretend we
				// navigated from the search page
				curl_setopt($curlproc, CURLOPT_HTTPHEADER,
					["User-Agent: " . config::USER_AGENT,
					"Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
					"Accept-Language: en-US,en;q=0.5",
					"Accept-Encoding: gzip, deflate, br, zstd",
					"DNT: 1",
					"Sec-GPC: 1",
					"Connection: keep-alive",
					"Referer: https://greppr.org/search",
					"Cookie: {$cookie}",
					"Upgrade-Insecure-Requests: 1",
					"Sec-Fetch-Dest: document",
					"Sec-Fetch-Mode: navigate",
					"Sec-Fetch-Site: same-origin",
					"Sec-Fetch-User: ?1",
					"Priority: u=0, i"]
				);
			}
		}else{
			// POST: submit the scraped homepage form fields
			$get = http_build_query($get);

			curl_setopt($curlproc, CURLOPT_POST, true);
			curl_setopt($curlproc, CURLOPT_POSTFIELDS, $get);

			curl_setopt($curlproc, CURLOPT_HTTPHEADER,
				["User-Agent: " . config::USER_AGENT,
				"Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
				"Accept-Language: en-US,en;q=0.5",
				"Accept-Encoding: gzip, deflate, br, zstd",
				"Content-Type: application/x-www-form-urlencoded",
				"Content-Length: " . strlen($get),
				"Origin: https://greppr.org",
				"DNT: 1",
				"Sec-GPC: 1",
				"Connection: keep-alive",
				"Referer: https://greppr.org/",
				"Cookie: {$cookie}",
				"Upgrade-Insecure-Requests: 1",
				"Sec-Fetch-Dest: document",
				"Sec-Fetch-Mode: navigate",
				"Sec-Fetch-Site: same-origin",
				"Sec-Fetch-User: ?1",
				"Priority: u=0, i"]
			);
		}

		curl_setopt($curlproc, CURLOPT_RETURNTRANSFER, true);
		curl_setopt($curlproc, CURLOPT_SSL_VERIFYHOST, 2);
		curl_setopt($curlproc, CURLOPT_SSL_VERIFYPEER, true);
		curl_setopt($curlproc, CURLOPT_CONNECTTIMEOUT, 30);
		curl_setopt($curlproc, CURLOPT_TIMEOUT, 30);
		$this->backend->assign_proxy($curlproc, $proxy);

		// collect response headers (lowercased names) so the caller can
		// read Set-Cookie for session handling
		$headers = [];

		curl_setopt(
			$curlproc,
			CURLOPT_HEADERFUNCTION,
			function($curlproc, $header) use (&$headers){

				$len = strlen($header);
				$header = explode(':', $header, 2);

				if(count($header) < 2){
					// ignore invalid headers
					return $len;
				}

				$headers[strtolower(trim($header[0]))][] = trim($header[1]);

				return $len;
			}
		);

		$data = curl_exec($curlproc);

		if(curl_errno($curlproc)){
			throw new Exception(curl_error($curlproc));
		}

		curl_close($curlproc);

		return [
			"headers" => $headers,
			"data" => $data
		];
	}

	/**
	 * Web search endpoint.
	 *
	 * Fresh searches scrape the homepage form (one input with a value is
	 * carried over as-is; the single value-less input's name is the field
	 * that must carry the query), then POST it to /search. Pagination
	 * replays a stored href + cookies from the npt token.
	 *
	 * @param array $get           sanitized request parameters ("s", "npt")
	 * @param bool  $first_attempt unused in this implementation
	 * @return array standard scraper result structure
	 * @throws Exception on transport failure or missing request token
	 */
	public function web($get, $first_attempt = true){

		if($get["npt"]){
			// npt token holds the next-page href and the session cookies
			[$q, $proxy] = $this->backend->get($get["npt"], "web");

			$tokens = json_decode($q, true);

			//
			// Get paginated page
			//
			try{
				$html = $this->get(
					$proxy,
					"https://greppr.org" . $tokens["get"],
					[],
					$tokens["cookies"],
					false
				);
			}catch(Exception $error){
				throw new Exception("Failed to fetch search page");
			}
		}else{
			$search = $get["s"];
			if(strlen($search) === 0){
				throw new Exception("Search term is empty!");
			}

			$proxy = $this->backend->get_ip();

			//
			// get token
			//
			try{
				$html =
					$this->get(
						$proxy,
						"https://greppr.org",
						[],
						[],
						false
					);
			}catch(Exception $error){
				throw new Exception("Failed to fetch homepage");
			}

			//
			// Parse token
			//
			$this->fuckhtml->load($html["data"]);

			// req     = name of the input that carries the search query
			// data    = hidden form fields to replay unchanged
			// cookies = session cookies scraped from Set-Cookie
			$tokens = [
				"req" => null,
				"data" => null,
				"cookies" => null
			];

			$inputs =
				$this->fuckhtml
				->getElementsByTagName(
					"input"
				);

			foreach($inputs as $input){

				if(!isset($input["attributes"]["name"])){
					continue;
				}

				if(
					isset($input["attributes"]["value"]) &&
					!empty($input["attributes"]["value"])
				){
					// input with a value: replay name=value verbatim
					$tokens
						["data"]
						[$this->fuckhtml
							->getTextContent(
								$input["attributes"]["name"]
							)] =
						$this->fuckhtml
						->getTextContent(
							$input["attributes"]["value"]
						);
				}else{
					// the value-less input is the query field
					$tokens["req"] =
						$this->fuckhtml
						->getTextContent(
							$input["attributes"]["name"]
						);
				}
			}

			if($tokens["req"] === null){
				throw new Exception("Failed to get request ID");
			}

			// carry over session cookies (first name=value pair of each
			// Set-Cookie header)
			if(isset($html["headers"]["set-cookie"])){

				foreach($html["headers"]["set-cookie"] as $cookie){

					if(
						preg_match(
							'/([^=]+)=([^;]+)/',
							$cookie,
							$matches
						)
					){
						$tokens["cookies"][$matches[1]] = $matches[2];
					}
				}
			}

			//
			// Get initial search page
			//
			$tokens_req = $tokens["data"];
			$tokens_req[$tokens["req"]] = $search;

			try{
				$html = $this->get(
					$proxy,
					"https://greppr.org/search",
					$tokens_req,
					$tokens["cookies"],
					true
				);
			}catch(Exception $error){
				throw new Exception("Failed to fetch search page");
			}
		}

		//$html = file_get_contents("scraper/greppr.html");
		//$this->fuckhtml->load($html);
		$this->fuckhtml->load($html["data"]);

		$out = [
			"status" => "ok",
			"spelling" => [
				"type" => "no_correction",
				"using" => null,
				"correction" => null
			],
			"npt" => null,
			"answer" => [],
			"web" => [],
			"image" => [],
			"video" => [],
			"news" => [],
			"related" => []
		];

		// get results for later
		$results =
			$this->fuckhtml
			->getElementsByClassName(
				"result",
				"div"
			);

		// check for next page
		$next_elem =
			$this->fuckhtml
			->getElementsByClassName(
				"pagination",
				"ul"
			);

		if(count($next_elem) !== 0){

			$this->fuckhtml->load($next_elem[0]);

			$as =
				$this->fuckhtml
				->getElementsByClassName(
					"page-link",
					"a"
				);

			// the current page's link has href="#"; the anchor right
			// after it is the next page
			$break = false;
			foreach($as as $a){

				if($break === true){

					$out["npt"] =
						$this->backend->store(
							json_encode([
								"get" =>
									$this->fuckhtml
									->getTextContent(
										$a["attributes"]["href"]
									),
								"cookies" => $tokens["cookies"]
							]),
							"web",
							$proxy
						);
					break;
				}

				if($a["attributes"]["href"] == "#"){

					$break = true;
				}
			}
		}

		// scrape results
		foreach($results as $result){

			// re-scope the parser to this result only
			$this->fuckhtml->load($result);

			// first anchor holds both the title and the target URL
			$a =
				$this->fuckhtml
				->getElementsByTagName(
					"a"
				)[0];

			$description =
				$this->fuckhtml
				->getElementsByClassName(
					"highlightedDesc",
					"p"
				);

			if(count($description) === 0){

				$description = null;
			}else{

				$description =
					$this->limitstrlen(
						$this->fuckhtml
						->getTextContent(
							$description[0]
						)
					);
			}

			// the last <p> looks like "Label: <date>"; take the part
			// after the colon and parse it
			$date =
				$this->fuckhtml
				->getElementsByTagName(
					"p"
				);

			$date =
				strtotime(
					explode(
						":",
						$this->fuckhtml
						->getTextContent(
							$date[count($date) - 1]["innerHTML"]
						)
					)[1]
				);

			$out["web"][] = [
				"title" =>
					$this->fuckhtml
					->getTextContent(
						$a["innerHTML"]
					),
				"description" => $description,
				"url" =>
					$this->fuckhtml
					->getTextContent(
						$a["attributes"]["href"]
					),
				"date" => $date,
				"type" => "web",
				"thumb" => [
					"url" => null,
					"ratio" => null
				],
				"sublink" => [],
				"table" => []
			];
		}

		return $out;
	}

	// Clamp a description to roughly 300 characters on a word boundary.
	private function limitstrlen($text){

		return explode("\n", wordwrap($text, 300, "\n"))[0];
	}
}