From 9f609008758f8f138eb8a7f7f7315dacbf7de224 Mon Sep 17 00:00:00 2001
From: lolcat <will@lolcat.ca>
Date: Sat, 11 Jan 2025 14:12:54 -0500
Subject: [PATCH] 500px scraper

---
 data/config.php     |   3 +-
 lib/frontend.php    |   1 +
 scraper/fivehpx.php | 262 ++++++++++++++++++++++++++++++++++++++++++++
 settings.php        |   4 +
 4 files changed, 269 insertions(+), 1 deletion(-)
 create mode 100644 scraper/fivehpx.php

diff --git a/data/config.php b/data/config.php
index bcda644..028a232 100644
--- a/data/config.php
+++ b/data/config.php
@@ -119,7 +119,7 @@ class config{
 	
 	// Default user agent to use for scraper requests. Sometimes ignored to get specific webpages
 	// Changing this might break things.
-	const USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:133.0) Gecko/20100101 Firefox/133.0";
+	const USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:134.0) Gecko/20100101 Firefox/134.0";
 	
 	// Proxy pool assignments for each scraper
 	// false = Use server's raw IP
@@ -143,6 +143,7 @@ class config{
 	const PROXY_YT = false; // youtube
 	const PROXY_YEP = false;
 	const PROXY_PINTEREST = false;
+	const PROXY_FIVEHPX = false;
 	const PROXY_SEZNAM = false;
 	const PROXY_NAVER = false;
 	const PROXY_GREPPR = false;
diff --git a/lib/frontend.php b/lib/frontend.php
index 82fd4bd..a335360 100644
--- a/lib/frontend.php
+++ b/lib/frontend.php
@@ -970,6 +970,7 @@ class frontend{
 						"yep" => "Yep",
 						"solofield" => "Solofield",
 						"pinterest" => "Pinterest",
+						"fivehpx" => "500px",
 						"imgur" => "Imgur",
 						"ftm" => "FindThatMeme"
 					]
diff --git a/scraper/fivehpx.php b/scraper/fivehpx.php
new file mode 100644
index 0000000..8a600df
--- /dev/null
+++ b/scraper/fivehpx.php
@@ -0,0 +1,350 @@
+<?php
+
+/**
+ * Image scraper for 500px search. Results are fetched from the 500px GraphQL
+ * endpoint and normalised by image().
+ */
+class fivehpx{
+
+	// Declared explicitly: creating dynamic properties is deprecated as of
+	// PHP 8.2. Left public, the visibility they had as dynamic properties.
+	public $backend;
+	public $fuckhtml;
+
+	/**
+	 * Loads the backend and HTML helper libraries and instantiates both.
+	 */
+	public function __construct(){
+
+		// *_once so an already loaded library is not declared a second time
+		include_once "lib/backend.php";
+		$this->backend = new backend("fivehpx");
+
+		include_once "lib/fuckhtml.php";
+		$this->fuckhtml = new fuckhtml();
+	}
+
+	/**
+	 * Returns the filters offered by this scraper.
+	 *
+	 * @param mixed $page Not used; the same filters are returned for any value.
+	 * @return array Filter definitions keyed by filter name.
+	 */
+	public function getfilters($page){
+
+		return [
+			"sort" => [
+				"display" => "Sort",
+				"option" => [
+					"relevance" => "Relevance",
+					"pulse" => "Pulse",
+					"newest" => "Newest"
+				]
+			]
+		];
+	}
+
+	/**
+	 * Sends an HTTP request with browser-like headers and returns the body.
+	 *
+	 * A plain request is made when $post_data is null; otherwise $post_data is
+	 * sent as the body of a POST request declared as application/json.
+	 *
+	 * @param mixed $proxy Proxy passed on to backend::assign_proxy().
+	 * @param string $url URL to request.
+	 * @param array $get Query parameters appended to $url when not empty.
+	 * @param string|null $post_data Raw POST body, or null for no body.
+	 * @return string Response body.
+	 * @throws Exception Carrying the cURL error message if the transfer fails.
+	 */
+	private function get($proxy, $url, $get = [], $post_data = null){
+
+		$curlproc = curl_init();
+
+		if($get !== []){
+			$get = http_build_query($get);
+			$url .= "?" . $get;
+		}
+
+		curl_setopt($curlproc, CURLOPT_URL, $url);
+
+		curl_setopt($curlproc, CURLOPT_ENCODING, ""); // default encoding
+
+		if($post_data === null){
+
+			curl_setopt($curlproc, CURLOPT_HTTPHEADER,
+				["User-Agent: " . config::USER_AGENT,
+				"Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
+				"Accept-Language: en-US,en;q=0.5",
+				"Accept-Encoding: gzip",
+				"DNT: 1",
+				"Sec-GPC: 1",
+				"Connection: keep-alive",
+				"Upgrade-Insecure-Requests: 1",
+				"Sec-Fetch-Dest: document",
+				"Sec-Fetch-Mode: navigate",
+				"Sec-Fetch-Site: same-origin",
+				"Sec-Fetch-User: ?1",
+				"Priority: u=0, i",
+				"TE: trailers"]
+			);
+		}else{
+
+			curl_setopt($curlproc, CURLOPT_HTTPHEADER,
+				["User-Agent: " . config::USER_AGENT,
+				"Accept: */*",
+				"Accept-Language: en-US,en;q=0.5",
+				"Accept-Encoding: gzip",
+				"Referer: https://500px.com/",
+				"content-type: application/json",
+				//"x-csrf-token: undefined",
+				"x-500px-source: Search",
+				"Content-Length: " . strlen($post_data),
+				"Origin: https://500px.com",
+				"DNT: 1",
+				"Sec-GPC: 1",
+				"Connection: keep-alive",
+				// "Cookie: _pin_unauth, _fbp, _sharedID, _sharedID_cst",
+				"Sec-Fetch-Dest: empty",
+				"Sec-Fetch-Mode: cors",
+				"Sec-Fetch-Site: same-site",
+				"Priority: u=4",
+				"TE: trailers"]
+			);
+
+			// set post data
+			curl_setopt($curlproc, CURLOPT_POST, true);
+			curl_setopt($curlproc, CURLOPT_POSTFIELDS, $post_data);
+		}
+
+		curl_setopt($curlproc, CURLOPT_RETURNTRANSFER, true);
+		curl_setopt($curlproc, CURLOPT_SSL_VERIFYHOST, 2);
+		curl_setopt($curlproc, CURLOPT_SSL_VERIFYPEER, true);
+		curl_setopt($curlproc, CURLOPT_CONNECTTIMEOUT, 30);
+		curl_setopt($curlproc, CURLOPT_TIMEOUT, 30);
+
+		// http2 bypass
+		curl_setopt($curlproc, CURLOPT_HTTP_VERSION, CURL_HTTP_VERSION_2_0);
+
+		$this->backend->assign_proxy($curlproc, $proxy);
+
+		$data = curl_exec($curlproc);
+
+		if(curl_errno($curlproc)){
+
+			// read the message before releasing the handle, so the handle
+			// is not leaked on the error path
+			$error = curl_error($curlproc);
+			curl_close($curlproc);
+
+			throw new Exception($error);
+		}
+
+		curl_close($curlproc);
+		return $data;
+	}
+
+	/**
+	 * Searches 500px for images.
+	 *
+	 * @param array $get Request parameters; "npt" (next page token), "s" (search
+	 *                   term) and "sort" are read.
+	 * @return array Array with "status", "npt" (token for the next page, or null)
+	 *               and "image" (list of results).
+	 * @throws Exception If the search term is empty, the stored page state cannot
+	 *                   be decoded, the request fails, or the API response is
+	 *                   unusable or reports an error.
+	 */
+	public function image($get){
+
+		if($get["npt"]){
+
+			[$pagination, $proxy] =
+				$this->backend->get(
+					$get["npt"], "images"
+				);
+
+			$pagination = json_decode($pagination, true);
+
+			// stop here rather than send a query without variables
+			if(!is_array($pagination)){
+
+				throw new Exception("Failed to decode pagination token");
+			}
+
+			$search = $pagination["search"];
+
+		}else{
+
+			$search = $get["s"];
+			if(strlen($search) === 0){
+
+				throw new Exception("Search term is empty!");
+			}
+
+			$proxy = $this->backend->get_ip();
+			$pagination = [
+				"sort" => strtoupper($get["sort"]),
+				"search" => $search,
+				"filters" => [],
+				"nlp" => false,
+			];
+		}
+
+		try{
+
+			$json =
+				$this->get(
+					$proxy,
+					"https://api.500px.com/graphql",
+					[],
+					json_encode([
+						"operationName" => "PhotoSearchPaginationContainerQuery",
+						"variables" => $pagination,
+						"query" =>
+							'query PhotoSearchPaginationContainerQuery(' .
+							(isset($pagination["cursor"]) ? '$cursor: String, ' : "") .
+							'$sort: PhotoSort, $search: String!, $filters: [PhotoSearchFilter!], $nlp: Boolean) {  ...PhotoSearchPaginationContainer_query_1vzAZD} fragment PhotoSearchPaginationContainer_query_1vzAZD on Query { photoSearch(sort: $sort, first: 100, ' .
+							(isset($pagination["cursor"]) ? 'after: $cursor, ' : "") .
+							'search: $search, filters: $filters, nlp: $nlp) { edges { node { id legacyId canonicalPath name description width height images(sizes: [33, 36]) { size url id } } } totalCount pageInfo { endCursor hasNextPage } }}'
+					])
+				);
+		}catch(Exception $error){
+
+			// chain the cause so it stays reachable through getPrevious()
+			throw new Exception("Failed to fetch graphQL object", 0, $error);
+		}
+
+		$json = json_decode($json, true);
+
+		if($json === null){
+
+			throw new Exception("Failed to decode graphQL object");
+		}
+
+		if(isset($json["errors"][0]["message"])){
+
+			throw new Exception("500px returned an API error: " . $json["errors"][0]["message"]);
+		}
+
+		if(!isset($json["data"]["photoSearch"]["edges"])){
+
+			throw new Exception("No edges returned by API");
+		}
+
+		$out = [
+			"status" => "ok",
+			"npt" => null,
+			"image" => []
+		];
+
+		foreach($json["data"]["photoSearch"]["edges"] as $image){
+
+			$image = $image["node"];
+
+			// NOTE(review): index 0 is treated as the small rendition and index 1
+			// as the large one, presumably the order of the requested sizes
+			// [33, 36] -- confirm. Nodes missing either URL are skipped.
+			if(
+				!isset($image["images"][0]["url"]) ||
+				!isset($image["images"][1]["url"])
+			){
+
+				continue;
+			}
+
+			// a missing name or description is treated as an empty string
+			$title =
+				trim(
+					$this->fuckhtml
+					->getTextContent(
+						$image["name"] ?? ""
+					) . ": " .
+					$this->fuckhtml
+					->getTextContent(
+						$image["description"] ?? ""
+					)
+					, " :"
+				);
+
+			$width = $image["width"] ?? null;
+			$height = $image["height"] ?? null;
+
+			$small = $this->image_ratio(600, $width, $height);
+			$large = $this->image_ratio(2048, $width, $height);
+
+			$out["image"][] = [
+				"title" => $title,
+				"source" => [
+					[
+						"url" => $image["images"][1]["url"],
+						"width" => $large[0],
+						"height" => $large[1]
+					],
+					[
+						"url" => $image["images"][0]["url"],
+						"width" => $small[0],
+						"height" => $small[1]
+					]
+				],
+				"url" => "https://500px.com" . $image["canonicalPath"]
+			];
+		}
+
+		// get NPT token
+		$page_info = $json["data"]["photoSearch"]["pageInfo"] ?? [];
+
+		if(
+			($page_info["hasNextPage"] ?? false) === true &&
+			isset($page_info["endCursor"])
+		){
+
+			$out["npt"] =
+				$this->backend->store(
+					json_encode([
+						"cursor" => $page_info["endCursor"],
+						"search" => $search,
+						"sort" => $pagination["sort"],
+						"filters" => [],
+						"nlp" => false
+					]),
+					"images",
+					$proxy
+				);
+		}
+
+		return $out;
+	}
+
+	/**
+	 * Scales a width/height pair so that its longest edge becomes
+	 * $longest_edge, keeping the aspect ratio.
+	 *
+	 * @param int|float $longest_edge Target length of the longest edge.
+	 * @param mixed $width Source width.
+	 * @param mixed $height Source height.
+	 * @return int[] Scaled [width, height], rounded down. When either source
+	 *               dimension is not a positive number, a square of
+	 *               $longest_edge is returned instead.
+	 */
+	private function image_ratio($longest_edge, $width, $height){
+
+		// dividing by a missing or zero dimension throws DivisionByZeroError on
+		// PHP 8, which a catch(Exception ...) block does not catch
+		if(
+			!is_numeric($width) || $width <= 0 ||
+			!is_numeric($height) || $height <= 0
+		){
+
+			return [(int)$longest_edge, (int)$longest_edge];
+		}
+
+		// the smaller factor is the one that fits the longest edge
+		$ratio = min($longest_edge / $width, $longest_edge / $height);
+
+		return [
+			(int)floor($width * $ratio),
+			(int)floor($height * $ratio)
+		];
+	}
+}
diff --git a/settings.php b/settings.php
index a3db7c4..6b3f774 100644
--- a/settings.php
+++ b/settings.php
@@ -231,6 +231,10 @@ $settings = [
 						"value" => "pinterest",
 						"text" => "Pinterest"
 					],
+					[
+						"value" => "fivehpx",
+						"text" => "500px"
+					],
 					[
 						"value" => "imgur",
 						"text" => "Imgur"