From f30872134fc925ca47b2b20830290904141ea143 Mon Sep 17 00:00:00 2001
From: lolcat
Date: Sun, 3 Aug 2025 12:28:57 -0400
Subject: [PATCH] handle mojeek block

---
 scraper/mojeek.php | 39 ++++++++++++++++++++++++++++++++++-----
 1 file changed, 34 insertions(+), 5 deletions(-)

diff --git a/scraper/mojeek.php b/scraper/mojeek.php
index b2d6ed5..c15d34c 100644
--- a/scraper/mojeek.php
+++ b/scraper/mojeek.php
@@ -501,11 +501,6 @@ class mojeek{
 				throw new Exception("Failed to get HTML");
 			}
 
-			/*
-			$handle = fopen("scraper/mojeek.html", "r");
-			$html = fread($handle, filesize("scraper/mojeek.html"));
-			fclose($handle);*/
-
 		}
 
 		$out = [
@@ -526,6 +521,8 @@ class mojeek{
 
 		$this->fuckhtml->load($html);
 
+		$this->detect_block();
+
 		$results =
 			$this->fuckhtml
 			->getElementsByClassName("results-standard", "ul");
@@ -1034,6 +1031,8 @@ class mojeek{
 
 		$this->fuckhtml->load($html);
 
+		$this->detect_block();
+
 		$articles = $this->fuckhtml->getElementsByTagName("article");
 
 
@@ -1166,6 +1165,36 @@ class mojeek{
 		return $out;
 	}
 
+	/**
+	 * Abort scraping when the currently loaded document is Mojeek's
+	 * block page.
+	 *
+	 * Looks up the <title> elements through $this->fuckhtml (presumably
+	 * the document last given to load() -- the callers invoke this right
+	 * after it) and compares the first one's text to "403 - Forbidden".
+	 *
+	 * @throws Exception when the title matches the block page title
+	 */
+	private function detect_block(){
+
+		$title =
+			$this->fuckhtml
+			->getElementsByTagName(
+				"title"
+			);
+
+		if(
+			count($title) !== 0 &&
+			$this->fuckhtml
+			->getTextContent(
+				$title[0]["innerHTML"]
+			) === "403 - Forbidden"
+		){
+
+			throw new Exception("Mojeek blocked this instance or request proxy.");
+		}
+	}
+
 	private function titledots($title){
 
 		return trim($title, ". \t\n\r\0\x0B");