From fbac3eeb8dedb961f55983f02d3c3a84ab0e7327 Mon Sep 17 00:00:00 2001 From: lolcat Date: Thu, 8 Aug 2024 03:29:29 -0400 Subject: [PATCH] fixed mwmbl, results are slightly better but wtf did they do to the sublinks my gawd --- docs/configure.md | 4 +-- scraper/mwmbl.php | 78 ++++++++++++++++++++++++++++++++++++++++++++--- 2 files changed, 75 insertions(+), 7 deletions(-) diff --git a/docs/configure.md b/docs/configure.md index 7cc4175..b5b88e4 100644 --- a/docs/configure.md +++ b/docs/configure.md @@ -8,10 +8,10 @@ Welcome! This guide assumes that you have a working 4get instance. This will hel 3. The captcha imagesets are located in `data/captcha/your_image_set/*.png` 4. The captcha font is located in `data/fonts/captcha.ttf` -# Cloudflare bypass +# Cloudflare bypass (TLS check) **Note: this only allows you to bypass the browser integrity checks. Captchas & javascript challenges will not be bypassed.** -Configuring this lets you fetch images sitting behind Cloudflare and allows you to scrape the **Yep** search engine. Following these instructions might make your package manager unhappy. +Configuring this lets you fetch images sitting behind Cloudflare and allows you to scrape the **Yep** & the **Mwmbl** search engines. Please be aware that APT will fight against you: it will constantly reinstall the OpenSSL version of curl when updating. First, follow these instructions. Only install the Firefox modules: diff --git a/scraper/mwmbl.php b/scraper/mwmbl.php index 671ec78..f2f8b70 100644 --- a/scraper/mwmbl.php +++ b/scraper/mwmbl.php @@ -27,18 +27,24 @@ class mwmbl{ curl_setopt($curlproc, CURLOPT_URL, $url); + // use http2 + curl_setopt($curlproc, CURLOPT_HTTP_VERSION, CURL_HTTP_VERSION_2_0); + curl_setopt($curlproc, CURLOPT_ENCODING, ""); // default encoding curl_setopt($curlproc, CURLOPT_HTTPHEADER, ["User-Agent: " . 
config::USER_AGENT, "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8", "Accept-Language: en-US,en;q=0.5", "Accept-Encoding: gzip", + "Referer: https://beta.mwmbl.org/", "DNT: 1", + "Sec-GPC: 1", "Connection: keep-alive", "Upgrade-Insecure-Requests: 1", "Sec-Fetch-Dest: document", "Sec-Fetch-Mode: navigate", - "Sec-Fetch-Site: none", + "Sec-Fetch-Site: same-origin", + "Priority: u=0, i", "Sec-Fetch-User: ?1"] ); @@ -46,7 +52,7 @@ class mwmbl{ curl_setopt($curlproc, CURLOPT_SSL_VERIFYHOST, 2); curl_setopt($curlproc, CURLOPT_SSL_VERIFYPEER, true); curl_setopt($curlproc, CURLOPT_CONNECTTIMEOUT, 30); - curl_setopt($curlproc, CURLOPT_TIMEOUT, 30); + curl_setopt($curlproc, CURLOPT_TIMEOUT, 30); // @todo reset $this->backend->assign_proxy($curlproc, $proxy); @@ -72,14 +78,14 @@ class mwmbl{ try{ $html = $this->get( $this->backend->get_ip(), // no next page! - "https://mwmbl.org/app/home/", + "https://beta.mwmbl.org/", [ "q" => $search ] ); }catch(Exception $error){ - throw new Exception("Failed to fetch HTML"); + throw new Exception("Failed to fetch HTML. If you're getting a timeout, make sure you have curl-impersonate setup."); } $out = [ @@ -115,6 +121,68 @@ class mwmbl{ $this->fuckhtml ->getElementsByTagName("p"); + $sublinks = []; + + $mores = + $this->fuckhtml + ->getElementsByClassName( + "result-link-more", + "div" + ); + + foreach($mores as $more){ + + $this->fuckhtml->load($more); + + $as = + $this->fuckhtml + ->getElementsByClassName( + "more", + "a" + ); + + if(count($as) === 0){ + + // ?? 
invalid + continue; + } + + $sublinks[] = [ + "title" => + $this->titledots( + $this->fuckhtml + ->getTextContent( + $this->fuckhtml + ->getElementsByClassName( + "more-title", + "span" + )[0] + ) + ), + "description" => + $this->titledots( + $this->fuckhtml + ->getTextContent( + $this->fuckhtml + ->getElementsByClassName( + "more-extract", + "span" + )[0] + ) + ), + "url" => + $this->fuckhtml + ->getTextContent( + $as[0] + ["attributes"] + ["href"] + ) + ]; + } + + // reset + $this->fuckhtml->load($result); + $out["web"][] = [ "title" => $this->titledots( @@ -153,7 +221,7 @@ class mwmbl{ "url" => null, "ratio" => null ], - "sublink" => [], + "sublink" => $sublinks, "table" => [] ]; }