remove extremely low quality scrapers

This commit is contained in:
2026-04-26 16:22:24 -04:00
parent a54f212550
commit 9ea0372bb7
4 changed files with 1 addition and 910 deletions

View File

@@ -1,145 +0,0 @@
<?php
// crowdview scraper: queries the CrowdView forum-search API
// (crowdview-next-js.onrender.com) and maps the JSON response into the
// common result structure shared by all scrapers in this project.
class crowdview{

	public function __construct(){
		// backend handles proxy assignment for outgoing requests
		include "lib/backend.php";
		$this->backend = new backend("crowdview");
		// fuckhtml is the in-house HTML parser, used here only for
		// text extraction in sanitize()
		include "lib/fuckhtml.php";
		$this->fuckhtml = new fuckhtml();
	}

	// This scraper exposes no user-facing search filters.
	public function getfilters($page){
		return [];
	}

	/**
	 * Perform a GET request through the assigned proxy.
	 *
	 * @param mixed  $proxy proxy descriptor obtained from backend::get_ip()
	 * @param string $url   absolute URL without a query string
	 * @param array  $get   query parameters, appended when non-empty
	 * @return string raw response body
	 * @throws Exception on any curl transport error
	 */
	private function get($proxy, $url, $get = []){
		$curlproc = curl_init();

		if($get !== []){
			$get = http_build_query($get);
			$url .= "?" . $get;
		}

		curl_setopt($curlproc, CURLOPT_URL, $url);
		curl_setopt($curlproc, CURLOPT_ENCODING, ""); // default encoding
		curl_setopt($curlproc, CURLOPT_HTTPHEADER,
			["User-Agent: " . config::USER_AGENT,
			"Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
			"Accept-Language: en-US,en;q=0.5",
			"Accept-Encoding: gzip",
			"DNT: 1",
			"Connection: keep-alive",
			"Upgrade-Insecure-Requests: 1",
			"Sec-Fetch-Dest: document",
			"Sec-Fetch-Mode: navigate",
			"Sec-Fetch-Site: none",
			"Sec-Fetch-User: ?1"]
		);

		curl_setopt($curlproc, CURLOPT_RETURNTRANSFER, true);
		curl_setopt($curlproc, CURLOPT_SSL_VERIFYHOST, 2);
		curl_setopt($curlproc, CURLOPT_SSL_VERIFYPEER, true);
		curl_setopt($curlproc, CURLOPT_CONNECTTIMEOUT, 30);
		curl_setopt($curlproc, CURLOPT_TIMEOUT, 30);
		$this->backend->assign_proxy($curlproc, $proxy);

		$data = curl_exec($curlproc);

		if(curl_errno($curlproc)){
			// grab the error text before closing, then release the
			// handle so it is not leaked on the error path
			$error = curl_error($curlproc);
			curl_close($curlproc);
			throw new Exception($error);
		}

		curl_close($curlproc);
		return $data;
	}

	/**
	 * Web search endpoint.
	 *
	 * @param array $get sanitized request parameters; "s" is the query
	 * @return array standard scraper result structure
	 * @throws Exception on empty query, transport failure or bad JSON
	 */
	public function web($get){
		$search = $get["s"];
		if(strlen($search) === 0){
			throw new Exception("Search term is empty!");
		}

		$proxy = $this->backend->get_ip();

		try{
			$json = $this->get(
				$proxy,
				"https://crowdview-next-js.onrender.com/api/search-v3",
				[
					"query" => $search
				]
			);
		}catch(Exception $error){
			throw new Exception("Failed to fetch JSON");
		}

		$out = [
			"status" => "ok",
			"spelling" => [
				"type" => "no_correction",
				"using" => null,
				"correction" => null
			],
			"npt" => null, // API exposes no pagination token
			"answer" => [],
			"web" => [],
			"image" => [],
			"video" => [],
			"news" => [],
			"related" => []
		];

		$json = json_decode($json, true);

		if($json === NULL){
			throw new Exception("Failed to decode JSON");
		}

		// "results" may be absent on an empty result set
		foreach($json["results"] ?? [] as $item){
			// snippet format appears to be "<date> <b>…" — the text before
			// the first <b> tag is a human-readable date
			$description = explode("<b>", $item["snippet"] ?? "", 2);

			// strtotime() returns false on unparseable input; normalize
			// that to null so callers get a timestamp or null, never false
			$date = strtotime($description[0]);

			$out["web"][] = [
				"title" => $this->sanitize($item["title"]),
				// snippets without a <b> delimiter have no description part
				"description" =>
					isset($description[1]) ?
						$this->sanitize($description[1]) : null,
				"url" => $item["link"],
				"date" => $date === false ? null : $date,
				"type" => "web",
				"thumb" => [
					"url" => null,
					"ratio" => null
				],
				"sublink" => [],
				"table" => []
			];
		}

		return $out;
	}

	// Decode HTML entities, strip markup through fuckhtml and trim stray
	// dots/whitespace left over from the snippet format.
	private function sanitize($html){
		return
			trim(
				$this->fuckhtml
				->getTextContent(
					html_entity_decode(
						$html
					)
				),
				". "
			);
	}
}

View File

@@ -1,309 +0,0 @@
<?php
// curlie scraper: scrapes the curlie.org web directory search results
// into the common result structure shared by all scrapers.
class curlie{

	public function __construct(){
		// backend handles proxy assignment and pagination-token storage
		include "lib/backend.php";
		$this->backend = new backend("curlie");
		// fuckhtml is the in-house HTML parser used to walk result markup
		include "lib/fuckhtml.php";
		$this->fuckhtml = new fuckhtml();
	}

	/**
	 * Filters exposed to the user. Only the "web" tab supports a
	 * language filter; every other page has none.
	 *
	 * @param string $page requested result page ("web", "image", ...)
	 * @return array filter definition consumed by the frontend
	 */
	public function getfilters($page){

		if($page != "web"){
			return [];
		}

		return [
			"lang" => [
				"display" => "Language",
				"option" => [
					"any" => "Any language",
					"en" => "English",
					"de" => "German",
					"fr" => "French",
					"ja" => "Japanese",
					"it" => "Italian",
					"es" => "Spanish",
					"ru" => "Russian",
					"nl" => "Dutch",
					"pl" => "Polish",
					"tr" => "Turkish",
					"da" => "Danish",
					"sv" => "Swedish",
					"no" => "Norwegian",
					"is" => "Icelandic",
					"fo" => "Faroese",
					"fi" => "Finnish",
					"et" => "Estonian",
					"lt" => "Lithuanian",
					"lv" => "Latvian",
					"cy" => "Welsh",
					"ga" => "Irish",
					"gd" => "Scottish Gaelic",
					"br" => "Breton",
					"fy" => "Frisian",
					"frr" => "North Frisian",
					"gem" => "Saterland Frisian",
					"lb" => "Luxembourgish",
					"rm" => "Romansh",
					"pt" => "Portuguese",
					"ca" => "Catalan",
					"gl" => "Galician",
					"eu" => "Basque",
					"ast" => "Asturian",
					"an" => "Aragonese",
					"fur" => "Friulan",
					"sc" => "Sardinian",
					"scn" => "Sicilian",
					"oc" => "Occitan",
					"be" => "Belarusian",
					"cs" => "Czech",
					"hu" => "Hungarian",
					"sk" => "Slovak",
					"uk" => "Ukrainian",
					"csb" => "Kashubian",
					"tt" => "Tatar",
					"ba" => "Bashkir",
					"os" => "Ossetian",
					"sl" => "Slovene",
					"sr" => "Serbian",
					"hr" => "Croatian",
					"bs" => "Bosnian",
					"bg" => "Bulgarian",
					"sq" => "Albanian",
					"ro" => "Romanian",
					"mk" => "Macedonian",
					"el" => "Greek",
					"iw" => "Hebrew",
					"fa" => "Persian",
					"ar" => "Arabic",
					"ku" => "Kurdish",
					"az" => "Azerbaijani",
					"hy" => "Armenian",
					"af" => "Afrikaans",
					"sw" => "Kiswahili",
					"uz" => "Uzbek",
					"kk" => "Kazakh",
					"ky" => "Kyrgyz",
					"tg" => "Tajik",
					"tk" => "Turkmen",
					"ug" => "Uyghurche",
					"hi" => "Hindi",
					"si" => "Sinhalese",
					"gu" => "Gujarati",
					"ur" => "Urdu",
					"mr" => "Marathi",
					"pa" => "Punjabi",
					"bn" => "Bengali",
					"ta" => "Tamil",
					"te" => "Telugu",
					"kn" => "Kannada",
					"zh_CN" => "Chinese Simplified",
					"zh_TW" => "Chinese Traditional",
					"ko" => "Korean",
					"cfr" => "Taiwanese",
					"th" => "Thai",
					"vi" => "Vietnamese",
					"in" => "Indonesian",
					"ms" => "Malay",
					"tl" => "Tagalog",
					"eo" => "Esperanto",
					"ia" => "Interlingua",
					"la" => "Latin"
				]
			]
		];
	}

	/**
	 * Perform a GET request through the assigned proxy.
	 *
	 * @param mixed  $proxy proxy descriptor obtained from backend
	 * @param string $url   absolute URL without a query string
	 * @param array  $get   query parameters, appended when non-empty
	 * @return string raw response body
	 * @throws Exception on any curl transport error
	 */
	private function get($proxy, $url, $get = []){
		$curlproc = curl_init();

		if($get !== []){
			$get = http_build_query($get);
			$url .= "?" . $get;
		}

		curl_setopt($curlproc, CURLOPT_URL, $url);
		curl_setopt($curlproc, CURLOPT_ENCODING, ""); // default encoding
		curl_setopt($curlproc, CURLOPT_HTTPHEADER,
			["User-Agent: " . config::USER_AGENT,
			"Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
			"Accept-Language: en-US,en;q=0.5",
			"Accept-Encoding: gzip",
			"DNT: 1",
			"Connection: keep-alive",
			"Upgrade-Insecure-Requests: 1",
			"Sec-Fetch-Dest: document",
			"Sec-Fetch-Mode: navigate",
			"Sec-Fetch-Site: none",
			"Sec-Fetch-User: ?1"]
		);

		curl_setopt($curlproc, CURLOPT_RETURNTRANSFER, true);
		curl_setopt($curlproc, CURLOPT_SSL_VERIFYHOST, 2);
		curl_setopt($curlproc, CURLOPT_SSL_VERIFYPEER, true);
		curl_setopt($curlproc, CURLOPT_CONNECTTIMEOUT, 30);
		curl_setopt($curlproc, CURLOPT_TIMEOUT, 30);
		$this->backend->assign_proxy($curlproc, $proxy);

		$data = curl_exec($curlproc);

		if(curl_errno($curlproc)){
			// grab the error text before closing, then release the
			// handle so it is not leaked on the error path
			$error = curl_error($curlproc);
			curl_close($curlproc);
			throw new Exception($error);
		}

		curl_close($curlproc);
		return $data;
	}

	/**
	 * Web search endpoint.
	 *
	 * Fresh searches hit /search with the query; paginated requests
	 * replay the stored relative path from the "next-page" anchor.
	 *
	 * @param array $get sanitized request parameters ("s", "lang", "npt")
	 * @return array standard scraper result structure
	 * @throws Exception on transport failure
	 */
	public function web($get){

		if($get["npt"]){
			// resume a paginated search: npt token stores the relative
			// URL of the next page plus the proxy it was fetched with
			[$query, $proxy] = $this->backend->get($get["npt"], "web");

			try{
				$html = $this->get(
					$proxy,
					"https://curlie.org/" . $query,
					[]
				);
			}catch(Exception $error){
				throw new Exception("Failed to fetch search page");
			}
		}else{
			$proxy = $this->backend->get_ip();

			$query = [
				"q" => $get["s"],
				"start" => 0,
				"stime" => 92452189 // ?
			];

			// "any" means no language restriction, so omit the parameter
			if($get["lang"] !== "any"){
				$query["lang"] = $get["lang"];
			}

			try{
				$html = $this->get(
					$proxy,
					"https://curlie.org/search",
					$query
				);
			}catch(Exception $error){
				throw new Exception("Failed to fetch search page");
			}
		}

		$this->fuckhtml->load($html);

		// pagination: presence of a "next-page" anchor means more results
		$nextpage =
			$this->fuckhtml
			->getElementsByClassName(
				"next-page",
				"a"
			);

		if(count($nextpage) !== 0){

			$nextpage =
				$this->backend->store(
					$nextpage[0]["attributes"]["href"],
					"web",
					$proxy
				);
		}else{
			$nextpage = null;
		}

		$out = [
			"status" => "ok",
			"spelling" => [
				"type" => "no_correction",
				"using" => null,
				"correction" => null
			],
			"npt" => $nextpage,
			"answer" => [],
			"web" => [],
			"image" => [],
			"video" => [],
			"news" => [],
			"related" => []
		];

		$items =
			$this->fuckhtml
			->getElementsByClassName(
				"site-item",
				"div"
			);

		foreach($items as $item){

			// re-scope the parser to this result only
			$this->fuckhtml->load($item);

			// the outbound link is the only anchor with target="_blank"
			$a =
				$this->fuckhtml
				->getElementsByAttributeValue(
					"target",
					"_blank",
					"a"
				);

			if(count($a) === 0){
				// malformed listing without an outbound link; skip it
				// instead of indexing into an empty array
				continue;
			}

			$a = $a[0];

			$description =
				$this->fuckhtml
				->getElementsByClassName("site-descr");

			if(count($description) !== 0){

				$description =
					$this->fuckhtml
					->getTextContent(
						$description[0]
					);
			}else{
				// directory entries may lack a description
				$description = null;
			}

			$out["web"][] = [
				"title" =>
					$this->fuckhtml
					->getTextContent(
						$a
					),
				"description" => $description,
				"url" =>
					$this->fuckhtml
					->getTextContent(
						$a["attributes"]["href"]
					),
				"date" => null, // curlie exposes no dates
				"type" => "web",
				"thumb" => [
					"url" => null,
					"ratio" => null
				],
				"sublink" => [],
				"table" => []
			];
		}

		return $out;
	}
}

View File

@@ -1,452 +0,0 @@
<?php
// greppr scraper: scrapes greppr.org web search results.
// The site has no API; a per-session request token must first be scraped
// from the homepage form and replayed (with cookies) on /search.
// greppr dev probably monitors 4get code, lol
// hello greppr dude, add an API you moron
class greppr{

	public function __construct(){
		// backend handles proxy assignment and pagination-token storage
		include "lib/backend.php";
		$this->backend = new backend("greppr");
		// fuckhtml is the in-house HTML parser used to walk result markup
		include "lib/fuckhtml.php";
		$this->fuckhtml = new fuckhtml();
	}

	// This scraper exposes no user-facing search filters.
	public function getfilters($page){
		return [];
	}

	/**
	 * Perform a proxied HTTP request and capture response headers.
	 *
	 * Three request shapes are produced:
	 *   - GET without cookies (first homepage hit),
	 *   - GET with a Cookie header and /search Referer (pagination),
	 *   - POST form submission with cookies (initial search).
	 *
	 * @param mixed  $proxy   proxy descriptor from backend
	 * @param string $url     absolute URL
	 * @param array  $get     query parameters (GET) or form fields (POST)
	 * @param array  $cookies cookie name => value map to replay
	 * @param bool   $post    true to send a POST instead of a GET
	 * @return array ["headers" => lowercased header map, "data" => body]
	 * @throws Exception on any curl transport error
	 *
	 * NOTE(review): when $post === false and $get is non-empty, $url is
	 * appended to AFTER CURLOPT_URL was already set, so GET parameters are
	 * never actually sent; pagination works only because the stored href
	 * already embeds its query string. Also, the curl handle is not closed
	 * on the error path before throwing.
	 */
	private function get($proxy, $url, $get = [], $cookies = [], $post = false){
		$curlproc = curl_init();

		curl_setopt($curlproc, CURLOPT_URL, $url);
		curl_setopt($curlproc, CURLOPT_ENCODING, ""); // default encoding

		// flatten the cookie map into a single Cookie header value
		$cookie = [];
		foreach($cookies as $k => $v){
			$cookie[] = "{$k}={$v}";
		}

		$cookie = implode("; ", $cookie);

		if($post === false){
			if($get !== []){
				$get = http_build_query($get);
				$url .= "?" . $get;
			}

			if($cookie == ""){
				// first-contact GET: mimic a browser landing on the site
				curl_setopt($curlproc, CURLOPT_HTTPHEADER,
					["User-Agent: " . config::USER_AGENT,
					"Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
					"Accept-Language: en-US,en;q=0.5",
					"Accept-Encoding: gzip",
					"DNT: 1",
					"Connection: keep-alive",
					"Upgrade-Insecure-Requests: 1",
					"Sec-Fetch-Dest: document",
					"Sec-Fetch-Mode: navigate",
					"Sec-Fetch-Site: none",
					"Sec-Fetch-User: ?1"]
				);
			}else{
				// follow-up GET: replay session cookies and pretend we
				// navigated from the search page
				curl_setopt($curlproc, CURLOPT_HTTPHEADER,
					["User-Agent: " . config::USER_AGENT,
					"Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
					"Accept-Language: en-US,en;q=0.5",
					"Accept-Encoding: gzip, deflate, br, zstd",
					"DNT: 1",
					"Sec-GPC: 1",
					"Connection: keep-alive",
					"Referer: https://greppr.org/search",
					"Cookie: {$cookie}",
					"Upgrade-Insecure-Requests: 1",
					"Sec-Fetch-Dest: document",
					"Sec-Fetch-Mode: navigate",
					"Sec-Fetch-Site: same-origin",
					"Sec-Fetch-User: ?1",
					"Priority: u=0, i"]
				);
			}
		}else{
			// POST: submit the scraped homepage form fields
			$get = http_build_query($get);

			curl_setopt($curlproc, CURLOPT_POST, true);
			curl_setopt($curlproc, CURLOPT_POSTFIELDS, $get);

			curl_setopt($curlproc, CURLOPT_HTTPHEADER,
				["User-Agent: " . config::USER_AGENT,
				"Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
				"Accept-Language: en-US,en;q=0.5",
				"Accept-Encoding: gzip, deflate, br, zstd",
				"Content-Type: application/x-www-form-urlencoded",
				"Content-Length: " . strlen($get),
				"Origin: https://greppr.org",
				"DNT: 1",
				"Sec-GPC: 1",
				"Connection: keep-alive",
				"Referer: https://greppr.org/",
				"Cookie: {$cookie}",
				"Upgrade-Insecure-Requests: 1",
				"Sec-Fetch-Dest: document",
				"Sec-Fetch-Mode: navigate",
				"Sec-Fetch-Site: same-origin",
				"Sec-Fetch-User: ?1",
				"Priority: u=0, i"]
			);
		}

		curl_setopt($curlproc, CURLOPT_RETURNTRANSFER, true);
		curl_setopt($curlproc, CURLOPT_SSL_VERIFYHOST, 2);
		curl_setopt($curlproc, CURLOPT_SSL_VERIFYPEER, true);
		curl_setopt($curlproc, CURLOPT_CONNECTTIMEOUT, 30);
		curl_setopt($curlproc, CURLOPT_TIMEOUT, 30);
		$this->backend->assign_proxy($curlproc, $proxy);

		// collect response headers (lowercased names) so the caller can
		// read Set-Cookie for session handling
		$headers = [];

		curl_setopt(
			$curlproc,
			CURLOPT_HEADERFUNCTION,
			function($curlproc, $header) use (&$headers){

				$len = strlen($header);
				$header = explode(':', $header, 2);

				if(count($header) < 2){
					// ignore invalid headers
					return $len;
				}

				$headers[strtolower(trim($header[0]))][] = trim($header[1]);

				return $len;
			}
		);

		$data = curl_exec($curlproc);

		if(curl_errno($curlproc)){
			throw new Exception(curl_error($curlproc));
		}

		curl_close($curlproc);

		return [
			"headers" => $headers,
			"data" => $data
		];
	}

	/**
	 * Web search endpoint.
	 *
	 * Fresh searches scrape the homepage form (one input with a value is
	 * carried over as-is; the single value-less input's name is the field
	 * that must carry the query), then POST it to /search. Pagination
	 * replays a stored href + cookies from the npt token.
	 *
	 * @param array $get           sanitized request parameters ("s", "npt")
	 * @param bool  $first_attempt unused in this implementation
	 * @return array standard scraper result structure
	 * @throws Exception on transport failure or missing request token
	 */
	public function web($get, $first_attempt = true){

		if($get["npt"]){
			// npt token holds the next-page href and the session cookies
			[$q, $proxy] = $this->backend->get($get["npt"], "web");

			$tokens = json_decode($q, true);

			//
			// Get paginated page
			//
			try{
				$html = $this->get(
					$proxy,
					"https://greppr.org" . $tokens["get"],
					[],
					$tokens["cookies"],
					false
				);
			}catch(Exception $error){
				throw new Exception("Failed to fetch search page");
			}
		}else{
			$search = $get["s"];
			if(strlen($search) === 0){
				throw new Exception("Search term is empty!");
			}

			$proxy = $this->backend->get_ip();

			//
			// get token
			//
			try{
				$html =
					$this->get(
						$proxy,
						"https://greppr.org",
						[],
						[],
						false
					);
			}catch(Exception $error){
				throw new Exception("Failed to fetch homepage");
			}

			//
			// Parse token
			//
			$this->fuckhtml->load($html["data"]);

			// req     = name of the input that carries the search query
			// data    = hidden form fields to replay unchanged
			// cookies = session cookies scraped from Set-Cookie
			$tokens = [
				"req" => null,
				"data" => null,
				"cookies" => null
			];

			$inputs =
				$this->fuckhtml
				->getElementsByTagName(
					"input"
				);

			foreach($inputs as $input){

				if(!isset($input["attributes"]["name"])){
					continue;
				}

				if(
					isset($input["attributes"]["value"]) &&
					!empty($input["attributes"]["value"])
				){
					// input with a value: replay name=value verbatim
					$tokens
						["data"]
						[$this->fuckhtml
							->getTextContent(
								$input["attributes"]["name"]
							)] =
						$this->fuckhtml
						->getTextContent(
							$input["attributes"]["value"]
						);
				}else{
					// the value-less input is the query field
					$tokens["req"] =
						$this->fuckhtml
						->getTextContent(
							$input["attributes"]["name"]
						);
				}
			}

			if($tokens["req"] === null){
				throw new Exception("Failed to get request ID");
			}

			// carry over session cookies (first name=value pair of each
			// Set-Cookie header)
			if(isset($html["headers"]["set-cookie"])){

				foreach($html["headers"]["set-cookie"] as $cookie){

					if(
						preg_match(
							'/([^=]+)=([^;]+)/',
							$cookie,
							$matches
						)
					){
						$tokens["cookies"][$matches[1]] = $matches[2];
					}
				}
			}

			//
			// Get initial search page
			//
			$tokens_req = $tokens["data"];
			$tokens_req[$tokens["req"]] = $search;

			try{
				$html = $this->get(
					$proxy,
					"https://greppr.org/search",
					$tokens_req,
					$tokens["cookies"],
					true
				);
			}catch(Exception $error){
				throw new Exception("Failed to fetch search page");
			}
		}

		//$html = file_get_contents("scraper/greppr.html");
		//$this->fuckhtml->load($html);
		$this->fuckhtml->load($html["data"]);

		$out = [
			"status" => "ok",
			"spelling" => [
				"type" => "no_correction",
				"using" => null,
				"correction" => null
			],
			"npt" => null,
			"answer" => [],
			"web" => [],
			"image" => [],
			"video" => [],
			"news" => [],
			"related" => []
		];

		// get results for later
		$results =
			$this->fuckhtml
			->getElementsByClassName(
				"result",
				"div"
			);

		// check for next page
		$next_elem =
			$this->fuckhtml
			->getElementsByClassName(
				"pagination",
				"ul"
			);

		if(count($next_elem) !== 0){

			$this->fuckhtml->load($next_elem[0]);

			$as =
				$this->fuckhtml
				->getElementsByClassName(
					"page-link",
					"a"
				);

			// the current page's link has href="#"; the anchor right
			// after it is the next page
			$break = false;
			foreach($as as $a){

				if($break === true){

					$out["npt"] =
						$this->backend->store(
							json_encode([
								"get" =>
									$this->fuckhtml
									->getTextContent(
										$a["attributes"]["href"]
									),
								"cookies" => $tokens["cookies"]
							]),
							"web",
							$proxy
						);
					break;
				}

				if($a["attributes"]["href"] == "#"){

					$break = true;
				}
			}
		}

		// scrape results
		foreach($results as $result){

			// re-scope the parser to this result only
			$this->fuckhtml->load($result);

			// first anchor holds both the title and the target URL
			$a =
				$this->fuckhtml
				->getElementsByTagName(
					"a"
				)[0];

			$description =
				$this->fuckhtml
				->getElementsByClassName(
					"highlightedDesc",
					"p"
				);

			if(count($description) === 0){

				$description = null;
			}else{

				$description =
					$this->limitstrlen(
						$this->fuckhtml
						->getTextContent(
							$description[0]
						)
					);
			}

			// the last <p> looks like "Label: <date>"; take the part
			// after the colon and parse it
			$date =
				$this->fuckhtml
				->getElementsByTagName(
					"p"
				);

			$date =
				strtotime(
					explode(
						":",
						$this->fuckhtml
						->getTextContent(
							$date[count($date) - 1]["innerHTML"]
						)
					)[1]
				);

			$out["web"][] = [
				"title" =>
					$this->fuckhtml
					->getTextContent(
						$a["innerHTML"]
					),
				"description" => $description,
				"url" =>
					$this->fuckhtml
					->getTextContent(
						$a["attributes"]["href"]
					),
				"date" => $date,
				"type" => "web",
				"thumb" => [
					"url" => null,
					"ratio" => null
				],
				"sublink" => [],
				"table" => []
			];
		}

		return $out;
	}

	// Clamp a description to roughly 300 characters on a word boundary.
	private function limitstrlen($text){

		return explode("\n", wordwrap($text, 300, "\n"))[0];
	}
}