forked from lolcat/4get
		
	fixed mwmbl, results are slightly better but wtf did they do to the sublinks my gawd
This commit is contained in:
		@@ -8,10 +8,10 @@ Welcome! This guide assumes that you have a working 4get instance. This will hel
 | 
				
			|||||||
3. The captcha imagesets are located in `data/captcha/your_image_set/*.png`
 | 
					3. The captcha imagesets are located in `data/captcha/your_image_set/*.png`
 | 
				
			||||||
4. The captcha font is located in `data/fonts/captcha.ttf`
 | 
					4. The captcha font is located in `data/fonts/captcha.ttf`
 | 
				
			||||||
 | 
					
 | 
				
			||||||
# Cloudflare bypass
 | 
					# Cloudflare bypass (TLS check)
 | 
				
			||||||
**Note: this only allows you to bypass the browser integrity checks. Captchas & javascript challenges will not be bypassed.**
 | 
					**Note: this only allows you to bypass the browser integrity checks. Captchas & javascript challenges will not be bypassed.**
 | 
				
			||||||
 | 
					
 | 
				
			||||||
Configuring this lets you fetch images sitting behind Cloudflare and allows you to scrape the **Yep** search engine. Following these instructions might make your package manager unhappy.
 | 
					Configuring this lets you fetch images sitting behind Cloudflare and allows you to scrape the **Yep** and **Mwmbl** search engines. Please be aware that APT will fight against you and will keep reinstalling the OpenSSL version of curl whenever you update.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
First, follow these instructions. Only install the Firefox modules:
 | 
					First, follow these instructions. Only install the Firefox modules:
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -27,18 +27,24 @@ class mwmbl{
 | 
				
			|||||||
		
 | 
							
 | 
				
			||||||
		curl_setopt($curlproc, CURLOPT_URL, $url);
 | 
							curl_setopt($curlproc, CURLOPT_URL, $url);
 | 
				
			||||||
		
 | 
							
 | 
				
			||||||
 | 
							// use http2
 | 
				
			||||||
 | 
							curl_setopt($curlproc, CURLOPT_HTTP_VERSION, CURL_HTTP_VERSION_2_0);
 | 
				
			||||||
 | 
							
 | 
				
			||||||
		curl_setopt($curlproc, CURLOPT_ENCODING, ""); // default encoding
 | 
							curl_setopt($curlproc, CURLOPT_ENCODING, ""); // default encoding
 | 
				
			||||||
		curl_setopt($curlproc, CURLOPT_HTTPHEADER,
 | 
							curl_setopt($curlproc, CURLOPT_HTTPHEADER,
 | 
				
			||||||
			["User-Agent: " . config::USER_AGENT,
 | 
								["User-Agent: " . config::USER_AGENT,
 | 
				
			||||||
			"Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
 | 
								"Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
 | 
				
			||||||
			"Accept-Language: en-US,en;q=0.5",
 | 
								"Accept-Language: en-US,en;q=0.5",
 | 
				
			||||||
			"Accept-Encoding: gzip",
 | 
								"Accept-Encoding: gzip",
 | 
				
			||||||
 | 
								"Referer: https://beta.mwmbl.org/",
 | 
				
			||||||
			"DNT: 1",
 | 
								"DNT: 1",
 | 
				
			||||||
 | 
								"Sec-GPC: 1",
 | 
				
			||||||
			"Connection: keep-alive",
 | 
								"Connection: keep-alive",
 | 
				
			||||||
			"Upgrade-Insecure-Requests: 1",
 | 
								"Upgrade-Insecure-Requests: 1",
 | 
				
			||||||
			"Sec-Fetch-Dest: document",
 | 
								"Sec-Fetch-Dest: document",
 | 
				
			||||||
			"Sec-Fetch-Mode: navigate",
 | 
								"Sec-Fetch-Mode: navigate",
 | 
				
			||||||
			"Sec-Fetch-Site: none",
 | 
								"Sec-Fetch-Site: same-origin",
 | 
				
			||||||
 | 
								"Priority: u=0, i",
 | 
				
			||||||
			"Sec-Fetch-User: ?1"]
 | 
								"Sec-Fetch-User: ?1"]
 | 
				
			||||||
		);
 | 
							);
 | 
				
			||||||
		
 | 
							
 | 
				
			||||||
@@ -46,7 +52,7 @@ class mwmbl{
 | 
				
			|||||||
		curl_setopt($curlproc, CURLOPT_SSL_VERIFYHOST, 2);
 | 
							curl_setopt($curlproc, CURLOPT_SSL_VERIFYHOST, 2);
 | 
				
			||||||
		curl_setopt($curlproc, CURLOPT_SSL_VERIFYPEER, true);
 | 
							curl_setopt($curlproc, CURLOPT_SSL_VERIFYPEER, true);
 | 
				
			||||||
		curl_setopt($curlproc, CURLOPT_CONNECTTIMEOUT, 30);
 | 
							curl_setopt($curlproc, CURLOPT_CONNECTTIMEOUT, 30);
 | 
				
			||||||
		curl_setopt($curlproc, CURLOPT_TIMEOUT, 30);
 | 
							curl_setopt($curlproc, CURLOPT_TIMEOUT, 30); // @todo reset
 | 
				
			||||||
		
 | 
							
 | 
				
			||||||
		$this->backend->assign_proxy($curlproc, $proxy);
 | 
							$this->backend->assign_proxy($curlproc, $proxy);
 | 
				
			||||||
		
 | 
							
 | 
				
			||||||
@@ -72,14 +78,14 @@ class mwmbl{
 | 
				
			|||||||
		try{
 | 
							try{
 | 
				
			||||||
			$html = $this->get(
 | 
								$html = $this->get(
 | 
				
			||||||
				$this->backend->get_ip(), // no next page!
 | 
									$this->backend->get_ip(), // no next page!
 | 
				
			||||||
				"https://mwmbl.org/app/home/",
 | 
									"https://beta.mwmbl.org/",
 | 
				
			||||||
				[
 | 
									[
 | 
				
			||||||
					"q" => $search
 | 
										"q" => $search
 | 
				
			||||||
				]
 | 
									]
 | 
				
			||||||
			);
 | 
								);
 | 
				
			||||||
		}catch(Exception $error){
 | 
							}catch(Exception $error){
 | 
				
			||||||
			
 | 
								
 | 
				
			||||||
			throw new Exception("Failed to fetch HTML");
 | 
								throw new Exception("Failed to fetch HTML. If you're getting a timeout, make sure you have curl-impersonate setup.");
 | 
				
			||||||
		}
 | 
							}
 | 
				
			||||||
		
 | 
							
 | 
				
			||||||
		$out = [
 | 
							$out = [
 | 
				
			||||||
@@ -115,6 +121,68 @@ class mwmbl{
 | 
				
			|||||||
				$this->fuckhtml
 | 
									$this->fuckhtml
 | 
				
			||||||
				->getElementsByTagName("p");
 | 
									->getElementsByTagName("p");
 | 
				
			||||||
			
 | 
								
 | 
				
			||||||
 | 
								$sublinks = [];
 | 
				
			||||||
 | 
								
 | 
				
			||||||
 | 
								$mores =
 | 
				
			||||||
 | 
									$this->fuckhtml
 | 
				
			||||||
 | 
									->getElementsByClassName(
 | 
				
			||||||
 | 
										"result-link-more",
 | 
				
			||||||
 | 
										"div"
 | 
				
			||||||
 | 
									);
 | 
				
			||||||
 | 
								
 | 
				
			||||||
 | 
								foreach($mores as $more){
 | 
				
			||||||
 | 
									
 | 
				
			||||||
 | 
									$this->fuckhtml->load($more);
 | 
				
			||||||
 | 
									
 | 
				
			||||||
 | 
									$as =
 | 
				
			||||||
 | 
										$this->fuckhtml
 | 
				
			||||||
 | 
										->getElementsByClassName(
 | 
				
			||||||
 | 
											"more",
 | 
				
			||||||
 | 
											"a"
 | 
				
			||||||
 | 
										);
 | 
				
			||||||
 | 
									
 | 
				
			||||||
 | 
									if(count($as) === 0){
 | 
				
			||||||
 | 
										
 | 
				
			||||||
 | 
										// ?? invalid
 | 
				
			||||||
 | 
										continue;
 | 
				
			||||||
 | 
									}
 | 
				
			||||||
 | 
									
 | 
				
			||||||
 | 
									$sublinks[] = [
 | 
				
			||||||
 | 
										"title" =>
 | 
				
			||||||
 | 
											$this->titledots(
 | 
				
			||||||
 | 
												$this->fuckhtml
 | 
				
			||||||
 | 
												->getTextContent(
 | 
				
			||||||
 | 
													$this->fuckhtml
 | 
				
			||||||
 | 
													->getElementsByClassName(
 | 
				
			||||||
 | 
														"more-title",
 | 
				
			||||||
 | 
														"span"
 | 
				
			||||||
 | 
													)[0]
 | 
				
			||||||
 | 
												)
 | 
				
			||||||
 | 
											),
 | 
				
			||||||
 | 
										"description" =>
 | 
				
			||||||
 | 
											$this->titledots(
 | 
				
			||||||
 | 
												$this->fuckhtml
 | 
				
			||||||
 | 
												->getTextContent(
 | 
				
			||||||
 | 
													$this->fuckhtml
 | 
				
			||||||
 | 
													->getElementsByClassName(
 | 
				
			||||||
 | 
														"more-extract",
 | 
				
			||||||
 | 
														"span"
 | 
				
			||||||
 | 
													)[0]
 | 
				
			||||||
 | 
												)
 | 
				
			||||||
 | 
											),
 | 
				
			||||||
 | 
										"url" =>
 | 
				
			||||||
 | 
											$this->fuckhtml
 | 
				
			||||||
 | 
											->getTextContent(
 | 
				
			||||||
 | 
												$as[0]
 | 
				
			||||||
 | 
												["attributes"]
 | 
				
			||||||
 | 
												["href"]
 | 
				
			||||||
 | 
											)
 | 
				
			||||||
 | 
									];
 | 
				
			||||||
 | 
								}
 | 
				
			||||||
 | 
								
 | 
				
			||||||
 | 
								// reset
 | 
				
			||||||
 | 
								$this->fuckhtml->load($result);
 | 
				
			||||||
 | 
								
 | 
				
			||||||
			$out["web"][] = [
 | 
								$out["web"][] = [
 | 
				
			||||||
				"title" =>
 | 
									"title" =>
 | 
				
			||||||
					$this->titledots(
 | 
										$this->titledots(
 | 
				
			||||||
@@ -153,7 +221,7 @@ class mwmbl{
 | 
				
			|||||||
					"url" => null,
 | 
										"url" => null,
 | 
				
			||||||
					"ratio" => null
 | 
										"ratio" => null
 | 
				
			||||||
				],
 | 
									],
 | 
				
			||||||
				"sublink" => [],
 | 
									"sublink" => $sublinks,
 | 
				
			||||||
				"table" => []
 | 
									"table" => []
 | 
				
			||||||
			];
 | 
								];
 | 
				
			||||||
		}
 | 
							}
 | 
				
			||||||
 
 | 
				
			|||||||
		Reference in New Issue
	
	Block a user