500px scraper
This commit is contained in:
parent
631aa58565
commit
9f60900875
|
@ -119,7 +119,7 @@ class config{
|
||||||
|
|
||||||
// Default user agent to use for scraper requests. Sometimes ignored to get specific webpages
|
// Default user agent to use for scraper requests. Sometimes ignored to get specific webpages
|
||||||
// Changing this might break things.
|
// Changing this might break things.
|
||||||
const USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:133.0) Gecko/20100101 Firefox/133.0";
|
const USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:134.0) Gecko/20100101 Firefox/134.0";
|
||||||
|
|
||||||
// Proxy pool assignments for each scraper
|
// Proxy pool assignments for each scraper
|
||||||
// false = Use server's raw IP
|
// false = Use server's raw IP
|
||||||
|
@ -143,6 +143,7 @@ class config{
|
||||||
const PROXY_YT = false; // youtube
|
const PROXY_YT = false; // youtube
|
||||||
const PROXY_YEP = false;
|
const PROXY_YEP = false;
|
||||||
const PROXY_PINTEREST = false;
|
const PROXY_PINTEREST = false;
|
||||||
|
const PROXY_FIVEHPX = false;
|
||||||
const PROXY_SEZNAM = false;
|
const PROXY_SEZNAM = false;
|
||||||
const PROXY_NAVER = false;
|
const PROXY_NAVER = false;
|
||||||
const PROXY_GREPPR = false;
|
const PROXY_GREPPR = false;
|
||||||
|
|
|
@ -970,6 +970,7 @@ class frontend{
|
||||||
"yep" => "Yep",
|
"yep" => "Yep",
|
||||||
"solofield" => "Solofield",
|
"solofield" => "Solofield",
|
||||||
"pinterest" => "Pinterest",
|
"pinterest" => "Pinterest",
|
||||||
|
"fivehpx" => "500px",
|
||||||
"imgur" => "Imgur",
|
"imgur" => "Imgur",
|
||||||
"ftm" => "FindThatMeme"
|
"ftm" => "FindThatMeme"
|
||||||
]
|
]
|
||||||
|
|
|
@ -0,0 +1,262 @@
|
||||||
|
<?php
|
||||||
|
|
||||||
|
class fivehpx{
|
||||||
|
|
||||||
|
public function __construct(){
|
||||||
|
|
||||||
|
include "lib/backend.php";
|
||||||
|
$this->backend = new backend("fivehpx");
|
||||||
|
|
||||||
|
include "lib/fuckhtml.php";
|
||||||
|
$this->fuckhtml = new fuckhtml();
|
||||||
|
}
|
||||||
|
|
||||||
|
public function getfilters($page){
|
||||||
|
|
||||||
|
return [
|
||||||
|
"sort" => [
|
||||||
|
"display" => "Sort",
|
||||||
|
"option" => [
|
||||||
|
"relevance" => "Relevance",
|
||||||
|
"pulse" => "Pulse",
|
||||||
|
"newest" => "Newest"
|
||||||
|
]
|
||||||
|
]
|
||||||
|
];
|
||||||
|
}
|
||||||
|
|
||||||
|
private function get($proxy, $url, $get = [], $post_data = null){
|
||||||
|
|
||||||
|
$curlproc = curl_init();
|
||||||
|
|
||||||
|
if($get !== []){
|
||||||
|
$get = http_build_query($get);
|
||||||
|
$url .= "?" . $get;
|
||||||
|
}
|
||||||
|
|
||||||
|
curl_setopt($curlproc, CURLOPT_URL, $url);
|
||||||
|
|
||||||
|
curl_setopt($curlproc, CURLOPT_ENCODING, ""); // default encoding
|
||||||
|
|
||||||
|
if($post_data === null){
|
||||||
|
|
||||||
|
curl_setopt($curlproc, CURLOPT_HTTPHEADER,
|
||||||
|
["User-Agent: " . config::USER_AGENT,
|
||||||
|
"Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
|
||||||
|
"Accept-Language: en-US,en;q=0.5",
|
||||||
|
"Accept-Encoding: gzip",
|
||||||
|
"DNT: 1",
|
||||||
|
"Sec-GPC: 1",
|
||||||
|
"Connection: keep-alive",
|
||||||
|
"Upgrade-Insecure-Requests: 1",
|
||||||
|
"Sec-Fetch-Dest: document",
|
||||||
|
"Sec-Fetch-Mode: navigate",
|
||||||
|
"Sec-Fetch-Site: same-origin",
|
||||||
|
"Sec-Fetch-User: ?1",
|
||||||
|
"Priority: u=0, i",
|
||||||
|
"TE: trailers"]
|
||||||
|
);
|
||||||
|
}else{
|
||||||
|
|
||||||
|
curl_setopt($curlproc, CURLOPT_HTTPHEADER,
|
||||||
|
["User-Agent: " . config::USER_AGENT,
|
||||||
|
"Accept: */*",
|
||||||
|
"Accept-Language: en-US,en;q=0.5",
|
||||||
|
"Accept-Encoding: gzip",
|
||||||
|
"Referer: https://500px.com/",
|
||||||
|
"content-type: application/json",
|
||||||
|
//"x-csrf-token: undefined",
|
||||||
|
"x-500px-source: Search",
|
||||||
|
"Content-Length: " . strlen($post_data),
|
||||||
|
"Origin: https://500px.com",
|
||||||
|
"DNT: 1",
|
||||||
|
"Sec-GPC: 1",
|
||||||
|
"Connection: keep-alive",
|
||||||
|
// "Cookie: _pin_unauth, _fbp, _sharedID, _sharedID_cst",
|
||||||
|
"Sec-Fetch-Dest: empty",
|
||||||
|
"Sec-Fetch-Mode: cors",
|
||||||
|
"Sec-Fetch-Site: same-site",
|
||||||
|
"Priority: u=4",
|
||||||
|
"TE: trailers"]
|
||||||
|
);
|
||||||
|
|
||||||
|
// set post data
|
||||||
|
curl_setopt($curlproc, CURLOPT_POST, true);
|
||||||
|
curl_setopt($curlproc, CURLOPT_POSTFIELDS, $post_data);
|
||||||
|
}
|
||||||
|
|
||||||
|
curl_setopt($curlproc, CURLOPT_RETURNTRANSFER, true);
|
||||||
|
curl_setopt($curlproc, CURLOPT_SSL_VERIFYHOST, 2);
|
||||||
|
curl_setopt($curlproc, CURLOPT_SSL_VERIFYPEER, true);
|
||||||
|
curl_setopt($curlproc, CURLOPT_CONNECTTIMEOUT, 30);
|
||||||
|
curl_setopt($curlproc, CURLOPT_TIMEOUT, 30);
|
||||||
|
|
||||||
|
// http2 bypass
|
||||||
|
curl_setopt($curlproc, CURLOPT_HTTP_VERSION, CURL_HTTP_VERSION_2_0);
|
||||||
|
|
||||||
|
$this->backend->assign_proxy($curlproc, $proxy);
|
||||||
|
|
||||||
|
$data = curl_exec($curlproc);
|
||||||
|
|
||||||
|
if(curl_errno($curlproc)){
|
||||||
|
|
||||||
|
throw new Exception(curl_error($curlproc));
|
||||||
|
}
|
||||||
|
|
||||||
|
curl_close($curlproc);
|
||||||
|
return $data;
|
||||||
|
}
|
||||||
|
|
||||||
|
public function image($get){
|
||||||
|
|
||||||
|
if($get["npt"]){
|
||||||
|
|
||||||
|
[$pagination, $proxy] =
|
||||||
|
$this->backend->get(
|
||||||
|
$get["npt"], "images"
|
||||||
|
);
|
||||||
|
|
||||||
|
$pagination = json_decode($pagination, true);
|
||||||
|
$search = $pagination["search"];
|
||||||
|
|
||||||
|
}else{
|
||||||
|
|
||||||
|
$search = $get["s"];
|
||||||
|
if(strlen($search) === 0){
|
||||||
|
|
||||||
|
throw new Exception("Search term is empty!");
|
||||||
|
}
|
||||||
|
|
||||||
|
$proxy = $this->backend->get_ip();
|
||||||
|
$pagination = [
|
||||||
|
"sort" => strtoupper($get["sort"]),
|
||||||
|
"search" => $search,
|
||||||
|
"filters" => [],
|
||||||
|
"nlp" => false,
|
||||||
|
];
|
||||||
|
}
|
||||||
|
|
||||||
|
try{
|
||||||
|
|
||||||
|
$json =
|
||||||
|
$this->get(
|
||||||
|
$proxy,
|
||||||
|
"https://api.500px.com/graphql",
|
||||||
|
[],
|
||||||
|
json_encode([
|
||||||
|
"operationName" => "PhotoSearchPaginationContainerQuery",
|
||||||
|
"variables" => $pagination,
|
||||||
|
"query" =>
|
||||||
|
'query PhotoSearchPaginationContainerQuery(' .
|
||||||
|
(isset($pagination["cursor"]) ? '$cursor: String, ' : "") .
|
||||||
|
'$sort: PhotoSort, $search: String!, $filters: [PhotoSearchFilter!], $nlp: Boolean) { ...PhotoSearchPaginationContainer_query_1vzAZD} fragment PhotoSearchPaginationContainer_query_1vzAZD on Query { photoSearch(sort: $sort, first: 100, ' .
|
||||||
|
(isset($pagination["cursor"]) ? 'after: $cursor, ' : "") .
|
||||||
|
'search: $search, filters: $filters, nlp: $nlp) { edges { node { id legacyId canonicalPath name description width height images(sizes: [33, 36]) { size url id } } } totalCount pageInfo { endCursor hasNextPage } }}'
|
||||||
|
])
|
||||||
|
);
|
||||||
|
}catch(Exception $error){
|
||||||
|
|
||||||
|
throw new Exception("Failed to fetch graphQL object");
|
||||||
|
}
|
||||||
|
|
||||||
|
$json = json_decode($json, true);
|
||||||
|
|
||||||
|
if($json === null){
|
||||||
|
|
||||||
|
throw new Exception("Failed to decode graphQL object");
|
||||||
|
}
|
||||||
|
|
||||||
|
if(isset($json["errors"][0]["message"])){
|
||||||
|
|
||||||
|
throw new Exception("500px returned an API error: " . $json["errors"][0]["message"]);
|
||||||
|
}
|
||||||
|
|
||||||
|
if(!isset($json["data"]["photoSearch"]["edges"])){
|
||||||
|
|
||||||
|
throw new Exception("No edges returned by API");
|
||||||
|
}
|
||||||
|
|
||||||
|
$out = [
|
||||||
|
"status" => "ok",
|
||||||
|
"npt" => null,
|
||||||
|
"image" => []
|
||||||
|
];
|
||||||
|
|
||||||
|
foreach($json["data"]["photoSearch"]["edges"] as $image){
|
||||||
|
|
||||||
|
$image = $image["node"];
|
||||||
|
$title =
|
||||||
|
trim(
|
||||||
|
$this->fuckhtml
|
||||||
|
->getTextContent(
|
||||||
|
$image["name"]
|
||||||
|
) . ": " .
|
||||||
|
$this->fuckhtml
|
||||||
|
->getTextContent(
|
||||||
|
$image["description"]
|
||||||
|
)
|
||||||
|
, " :"
|
||||||
|
);
|
||||||
|
|
||||||
|
$small = $this->image_ratio(600, $image["width"], $image["height"]);
|
||||||
|
$large = $this->image_ratio(2048, $image["width"], $image["height"]);
|
||||||
|
|
||||||
|
$out["image"][] = [
|
||||||
|
"title" => $title,
|
||||||
|
"source" => [
|
||||||
|
[
|
||||||
|
"url" => $image["images"][1]["url"],
|
||||||
|
"width" => $large[0],
|
||||||
|
"height" => $large[1]
|
||||||
|
],
|
||||||
|
[
|
||||||
|
"url" => $image["images"][0]["url"],
|
||||||
|
"width" => $small[0],
|
||||||
|
"height" => $small[1]
|
||||||
|
]
|
||||||
|
],
|
||||||
|
"url" => "https://500px.com" . $image["canonicalPath"]
|
||||||
|
];
|
||||||
|
}
|
||||||
|
|
||||||
|
// get NPT token
|
||||||
|
if($json["data"]["photoSearch"]["pageInfo"]["hasNextPage"] === true){
|
||||||
|
|
||||||
|
$out["npt"] =
|
||||||
|
$this->backend->store(
|
||||||
|
json_encode([
|
||||||
|
"cursor" => $json["data"]["photoSearch"]["pageInfo"]["endCursor"],
|
||||||
|
"search" => $search,
|
||||||
|
"sort" => $pagination["sort"],
|
||||||
|
"filters" => [],
|
||||||
|
"nlp" => false
|
||||||
|
]),
|
||||||
|
"images",
|
||||||
|
$proxy
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
return $out;
|
||||||
|
}
|
||||||
|
|
||||||
|
private function image_ratio($longest_edge, $width, $height){
|
||||||
|
|
||||||
|
$ratio = [
|
||||||
|
$longest_edge / $width,
|
||||||
|
$longest_edge / $height
|
||||||
|
];
|
||||||
|
|
||||||
|
if($ratio[0] < $ratio[1]){
|
||||||
|
|
||||||
|
$ratio = $ratio[0];
|
||||||
|
}else{
|
||||||
|
|
||||||
|
$ratio = $ratio[1];
|
||||||
|
}
|
||||||
|
|
||||||
|
return [
|
||||||
|
floor($width * $ratio),
|
||||||
|
floor($height * $ratio)
|
||||||
|
];
|
||||||
|
}
|
||||||
|
}
|
|
@ -231,6 +231,10 @@ $settings = [
|
||||||
"value" => "pinterest",
|
"value" => "pinterest",
|
||||||
"text" => "Pinterest"
|
"text" => "Pinterest"
|
||||||
],
|
],
|
||||||
|
[
|
||||||
|
"value" => "fivehpx",
|
||||||
|
"text" => "500px"
|
||||||
|
],
|
||||||
[
|
[
|
||||||
"value" => "imgur",
|
"value" => "imgur",
|
||||||
"text" => "Imgur"
|
"text" => "Imgur"
|
||||||
|
|
Loading…
Reference in New Issue