From 9f609008758f8f138eb8a7f7f7315dacbf7de224 Mon Sep 17 00:00:00 2001
From: lolcat
Date: Sat, 11 Jan 2025 14:12:54 -0500
Subject: [PATCH] 500px scraper

---
 data/config.php     |   3 +-
 lib/frontend.php    |   1 +
 scraper/fivehpx.php | 313 ++++++++++++++++++++++++++++++++++++++++++++
 settings.php        |   4 +
 4 files changed, 320 insertions(+), 1 deletion(-)
 create mode 100644 scraper/fivehpx.php

diff --git a/data/config.php b/data/config.php
index bcda644..028a232 100644
--- a/data/config.php
+++ b/data/config.php
@@ -119,7 +119,7 @@ class config{
 
 	// Default user agent to use for scraper requests. Sometimes ignored to get specific webpages
 	// Changing this might break things.
-	const USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:133.0) Gecko/20100101 Firefox/133.0";
+	const USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:134.0) Gecko/20100101 Firefox/134.0";
 
 	// Proxy pool assignments for each scraper
 	// false = Use server's raw IP
@@ -143,6 +143,7 @@ class config{
 	const PROXY_YT = false; // youtube
 	const PROXY_YEP = false;
 	const PROXY_PINTEREST = false;
+	const PROXY_FIVEHPX = false;
 	const PROXY_SEZNAM = false;
 	const PROXY_NAVER = false;
 	const PROXY_GREPPR = false;
diff --git a/lib/frontend.php b/lib/frontend.php
index 82fd4bd..a335360 100644
--- a/lib/frontend.php
+++ b/lib/frontend.php
@@ -970,6 +970,7 @@ class frontend{
 						"yep" => "Yep",
 						"solofield" => "Solofield",
 						"pinterest" => "Pinterest",
+						"fivehpx" => "500px",
 						"imgur" => "Imgur",
 						"ftm" => "FindThatMeme"
 					]
diff --git a/scraper/fivehpx.php b/scraper/fivehpx.php
new file mode 100644
index 0000000..8a600df
--- /dev/null
+++ b/scraper/fivehpx.php
@@ -0,0 +1,313 @@
+<?php
+
+/**
+ * Image scraper for 500px, backed by the api.500px.com GraphQL endpoint.
+ */
+class fivehpx{
+
+	public function __construct(){
+
+		include "lib/backend.php";
+		$this->backend = new backend("fivehpx");
+
+		include "lib/fuckhtml.php";
+		$this->fuckhtml = new fuckhtml();
+	}
+
+	/**
+	 * List the search filters offered by this scraper.
+	 *
+	 * @param mixed $page Unused; the same filter set is returned for every page
+	 * @return array Filter definitions keyed by filter name
+	 */
+	public function getfilters($page){
+
+		return [
+			"sort" => [
+				"display" => "Sort",
+				"option" => [
+					"relevance" => "Relevance",
+					"pulse" => "Pulse",
+					"newest" => "Newest"
+				]
+			]
+		];
+	}
+
+	/**
+	 * Perform an HTTP request through the given proxy and return the raw body.
+	 *
+	 * Sends a browser-like GET when $post_data is null, otherwise a JSON POST
+	 * carrying the headers of a 500px search request.
+	 *
+	 * @param mixed $proxy Proxy handed to backend::assign_proxy()
+	 * @param string $url Target URL, without query string
+	 * @param array $get Query parameters appended to $url
+	 * @param string|null $post_data JSON request body, or null for a GET
+	 * @return string Response body
+	 * @throws Exception With the cURL error message when the transfer fails
+	 */
+	private function get($proxy, $url, $get = [], $post_data = null){
+
+		$curlproc = curl_init();
+
+		if($get !== []){
+			$get = http_build_query($get);
+			$url .= "?" . $get;
+		}
+
+		curl_setopt($curlproc, CURLOPT_URL, $url);
+
+		curl_setopt($curlproc, CURLOPT_ENCODING, ""); // default encoding
+
+		if($post_data === null){
+
+			curl_setopt($curlproc, CURLOPT_HTTPHEADER,
+				["User-Agent: " . config::USER_AGENT,
+				"Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
+				"Accept-Language: en-US,en;q=0.5",
+				"Accept-Encoding: gzip",
+				"DNT: 1",
+				"Sec-GPC: 1",
+				"Connection: keep-alive",
+				"Upgrade-Insecure-Requests: 1",
+				"Sec-Fetch-Dest: document",
+				"Sec-Fetch-Mode: navigate",
+				"Sec-Fetch-Site: same-origin",
+				"Sec-Fetch-User: ?1",
+				"Priority: u=0, i",
+				"TE: trailers"]
+			);
+		}else{
+
+			curl_setopt($curlproc, CURLOPT_HTTPHEADER,
+				["User-Agent: " . config::USER_AGENT,
+				"Accept: */*",
+				"Accept-Language: en-US,en;q=0.5",
+				"Accept-Encoding: gzip",
+				"Referer: https://500px.com/",
+				"content-type: application/json",
+				//"x-csrf-token: undefined",
+				"x-500px-source: Search",
+				"Content-Length: " . strlen($post_data),
+				"Origin: https://500px.com",
+				"DNT: 1",
+				"Sec-GPC: 1",
+				"Connection: keep-alive",
+				// "Cookie: _pin_unauth, _fbp, _sharedID, _sharedID_cst",
+				"Sec-Fetch-Dest: empty",
+				"Sec-Fetch-Mode: cors",
+				"Sec-Fetch-Site: same-site",
+				"Priority: u=4",
+				"TE: trailers"]
+			);
+
+			// set post data
+			curl_setopt($curlproc, CURLOPT_POST, true);
+			curl_setopt($curlproc, CURLOPT_POSTFIELDS, $post_data);
+		}
+
+		curl_setopt($curlproc, CURLOPT_RETURNTRANSFER, true);
+		curl_setopt($curlproc, CURLOPT_SSL_VERIFYHOST, 2);
+		curl_setopt($curlproc, CURLOPT_SSL_VERIFYPEER, true);
+		curl_setopt($curlproc, CURLOPT_CONNECTTIMEOUT, 30);
+		curl_setopt($curlproc, CURLOPT_TIMEOUT, 30);
+
+		// http2 bypass
+		curl_setopt($curlproc, CURLOPT_HTTP_VERSION, CURL_HTTP_VERSION_2_0);
+
+		$this->backend->assign_proxy($curlproc, $proxy);
+
+		$data = curl_exec($curlproc);
+
+		if(curl_errno($curlproc)){
+
+			// release the handle before bailing out so it is not leaked
+			$error = curl_error($curlproc);
+			curl_close($curlproc);
+
+			throw new Exception($error);
+		}
+
+		curl_close($curlproc);
+		return $data;
+	}
+
+	/**
+	 * Search 500px for photos.
+	 *
+	 * @param array $get Request parameters: "s" (search term), "sort" (sort
+	 *                   filter value) and "npt" (token of a stored next page)
+	 * @return array "status", "npt" (next page token or null) and "image"
+	 *               (list of results)
+	 * @throws Exception When the search term is empty, the request fails or
+	 *                   the API response is unusable
+	 */
+	public function image($get){
+
+		if($get["npt"]){
+
+			[$pagination, $proxy] =
+				$this->backend->get(
+					$get["npt"], "images"
+				);
+
+			$pagination = json_decode($pagination, true);
+			$search = $pagination["search"];
+
+		}else{
+
+			$search = $get["s"];
+			if(strlen($search) === 0){
+
+				throw new Exception("Search term is empty!");
+			}
+
+			$proxy = $this->backend->get_ip();
+			$pagination = [
+				"sort" => strtoupper($get["sort"]),
+				"search" => $search,
+				"filters" => [],
+				"nlp" => false,
+			];
+		}
+
+		try{
+
+			$json =
+				$this->get(
+					$proxy,
+					"https://api.500px.com/graphql",
+					[],
+					json_encode([
+						"operationName" => "PhotoSearchPaginationContainerQuery",
+						"variables" => $pagination,
+						"query" =>
+							'query PhotoSearchPaginationContainerQuery(' .
+							(isset($pagination["cursor"]) ? '$cursor: String, ' : "") .
+							'$sort: PhotoSort, $search: String!, $filters: [PhotoSearchFilter!], $nlp: Boolean) { ...PhotoSearchPaginationContainer_query_1vzAZD} fragment PhotoSearchPaginationContainer_query_1vzAZD on Query { photoSearch(sort: $sort, first: 100, ' .
+							(isset($pagination["cursor"]) ? 'after: $cursor, ' : "") .
+							'search: $search, filters: $filters, nlp: $nlp) { edges { node { id legacyId canonicalPath name description width height images(sizes: [33, 36]) { size url id } } } totalCount pageInfo { endCursor hasNextPage } }}'
+					])
+				);
+		}catch(Exception $error){
+
+			// keep the underlying error reachable through getPrevious()
+			throw new Exception("Failed to fetch graphQL object", 0, $error);
+		}
+
+		$json = json_decode($json, true);
+
+		if($json === null){
+
+			throw new Exception("Failed to decode graphQL object");
+		}
+
+		if(isset($json["errors"][0]["message"])){
+
+			throw new Exception("500px returned an API error: " . $json["errors"][0]["message"]);
+		}
+
+		if(!isset($json["data"]["photoSearch"]["edges"])){
+
+			throw new Exception("No edges returned by API");
+		}
+
+		$out = [
+			"status" => "ok",
+			"npt" => null,
+			"image" => []
+		];
+
+		foreach($json["data"]["photoSearch"]["edges"] as $image){
+
+			$image = $image["node"];
+
+			// name and description may be missing or null; treat them as empty
+			$title =
+				trim(
+					$this->fuckhtml
+					->getTextContent(
+						$image["name"] ?? ""
+					) . ": " .
+					$this->fuckhtml
+					->getTextContent(
+						$image["description"] ?? ""
+					)
+					, " :"
+				);
+
+			$small = $this->image_ratio(600, $image["width"] ?? 0, $image["height"] ?? 0);
+			$large = $this->image_ratio(2048, $image["width"] ?? 0, $image["height"] ?? 0);
+
+			$out["image"][] = [
+				"title" => $title,
+				"source" => [
+					[
+						"url" => $image["images"][1]["url"],
+						"width" => $large[0],
+						"height" => $large[1]
+					],
+					[
+						"url" => $image["images"][0]["url"],
+						"width" => $small[0],
+						"height" => $small[1]
+					]
+				],
+				"url" => "https://500px.com" . $image["canonicalPath"]
+			];
+		}
+
+		// get NPT token
+		if($json["data"]["photoSearch"]["pageInfo"]["hasNextPage"] === true){
+
+			$out["npt"] =
+				$this->backend->store(
+					json_encode([
+						"cursor" => $json["data"]["photoSearch"]["pageInfo"]["endCursor"],
+						"search" => $search,
+						"sort" => $pagination["sort"],
+						"filters" => [],
+						"nlp" => false
+					]),
+					"images",
+					$proxy
+				);
+		}
+
+		return $out;
+	}
+
+	/**
+	 * Scale a width/height pair so that its longest edge equals $longest_edge.
+	 *
+	 * @param int|float $longest_edge Target length of the longest edge
+	 * @param int|float $width Source width
+	 * @param int|float $height Source height
+	 * @return array [width, height] as integers, rounded down; [0, 0] when
+	 *               either source dimension is missing or not positive
+	 */
+	private function image_ratio($longest_edge, $width, $height){
+
+		// dividing by a missing/zero dimension would raise DivisionByZeroError
+		if(
+			!is_numeric($width) || $width <= 0 ||
+			!is_numeric($height) || $height <= 0
+		){
+
+			return [0, 0];
+		}
+
+		// the smaller per-axis factor is the one that fits the longest edge
+		$ratio =
+			min(
+				$longest_edge / $width,
+				$longest_edge / $height
+			);
+
+		return [
+			(int)floor($width * $ratio),
+			(int)floor($height * $ratio)
+		];
+	}
+}
diff --git a/settings.php b/settings.php
index a3db7c4..6b3f774 100644
--- a/settings.php
+++ b/settings.php
@@ -231,6 +231,10 @@ $settings = [
 				"value" => "pinterest",
 				"text" => "Pinterest"
 			],
+			[
+				"value" => "fivehpx",
+				"text" => "500px"
+			],
 			[
 				"value" => "imgur",
 				"text" => "Imgur"