Compare commits

..

16 Commits

Author SHA1 Message Date
lolcat 033e4cb959 added vsco scraper 2025-01-11 23:07:58 -05:00
lolcat 91f621e105 readme changes 2025-01-11 14:34:23 -05:00
lolcat 9f60900875 500px scraper 2025-01-11 14:12:54 -05:00
lolcat 631aa58565 marginalia hotfix 2025-01-07 21:12:07 -05:00
lolcat b892f90b13 readme changes 2025-01-07 00:05:42 -05:00
lolcat 463ba0775f added pinterest 2025-01-06 23:56:49 -05:00
lolcat cfad4fb035 wordnik bugfix 2025-01-06 21:05:45 -05:00
lolcat 4e968b4b1c youtube scraper fix 2025-01-03 22:22:30 -05:00
lolcat 81df52235c fixed google crash 2025-01-03 21:43:40 -05:00
lolcat 1ca2626ad9 fix ddg bug with EOF result 2025-01-03 21:16:00 -05:00
lolcat 9ca93f34c6 ddg hotfix 2024-12-17 21:01:36 -05:00
lolcat 0a43b9c849 added arquivo.pt 2024-12-17 10:11:53 -05:00
lolcat b636fec319 fucking git is so shit 2024-12-17 00:35:15 -05:00
lolcat 774f7113df duckduckgo scraper rewrite 2024-12-17 00:31:15 -05:00
lolcat 0b3bbe0f15 gore's shitty theme fix 2024-12-02 15:19:10 -05:00
lolcat 5f0b0a7b83 findthatmeme fix 2024-12-01 15:59:03 -05:00
17 changed files with 2353 additions and 2391 deletions

View File

@ -1,48 +0,0 @@
name: '4get CI'
on:
workflow_dispatch:
push:
branches:
- '*'
paths-ignore:
- 'README.md'
- 'docker-compose.yaml'
- '.gitignore'
- 'docs/**'
jobs:
build:
runs-on: docker
steps:
- uses: actions/checkout@v4
name: Checkout 4get repository
- uses: docker/setup-buildx-action@v3
name: Setup Docker BuildX system
- name: Login to Docker Container Registry
uses: docker/login-action@v3
with:
registry: git.lolcat.ca
username: ${{ secrets.USERNAME }}
password: ${{ secrets.TOKEN }}
- name: Docker meta
id: meta
uses: docker/metadata-action@v5
with:
images: git.lolcat.ca/lolcat/4get
tags: |
type=sha,format=short,prefix={{date 'YYYY.MM.DD'}}-,enable=${{ github.ref == format('refs/heads/{0}', 'master') }}
type=raw,value=latest,enable=${{ github.ref == format('refs/heads/{0}', 'master') }}
- uses: docker/build-push-action@v6
name: Build images
with:
context: .
file: Dockerfile
tags: ${{ steps.meta.outputs.tags }}
platforms: linux/amd64
push: true

View File

@ -9,9 +9,11 @@ https://4get.ca/about
## Official instance
https://4get.ca , or visit the official instance list: https://4get.ca/instances
_NOT to be confused with 4get.ch, 4get.lol and friends! I **don't** host these._
## Totally unbiased comparison between alternatives
| | 4get | searx(ng) | libreY | araa | hearch |
| | 4get | searx(ng) | libreY | araa | hearch.co |
|----------------------------|-------------------------|-----------|-------------|-----------|-------------------|
| RAM usage | 200-400mb~ | 2GB~ | 200-400mb~ | 2GB~ | idk |
| Does it suck | no (debunked by snopes) | yes | yes | a little | better than searx |
@ -23,9 +25,9 @@ https://4get.ca , or visit the official instance list: https://4get.ca/instances
3. Bot protection that *actually* filters out the bots (when configured)
4. Interface doesn't require javascript
5. Favicon fetcher with caching support & image proxy
6. Bunch of other shit
6. Bunch of other shits
tl;dr the best way to actually browse for shit.
tl;dr 4get is the best way to browse for shit.
# Supported websites
@ -39,11 +41,11 @@ tl;dr the best way to actually browse for shit.
| Qwant | Qwant | Startpage | Mojeek | | Kagi |
| Ghostery | Yep | Qwant | | | Qwant |
| Yep | Solofield | Solofield | | | Ghostery |
| Greppr | Imgur | | | | Yep |
| Crowdview | FindThatMeme | | | | Marginalia |
| Mwmbl | | | | | YouTube |
| Mojeek | | | | | Soundcloud |
| Solofield | | | | | |
| Greppr | Pinterest | | | | Yep |
| Crowdview | 500px | | | | Marginalia |
| Mwmbl | VSCO | | | | YouTube |
| Mojeek | Imgur | | | | Soundcloud |
| Solofield | FindThatMeme | | | | |
| Marginalia | | | | | |
| wiby | | | | | |
| Curlie | | | | | |
@ -52,7 +54,7 @@ tl;dr the best way to actually browse for shit.
Refer to the <a href="https://git.lolcat.ca/lolcat/4get/src/branch/master/docs/">documentation index</a>. I recommend following the <a href="https://git.lolcat.ca/lolcat/4get/src/branch/master/docs/apache2.md">apache2 guide</a>.
## Contact
Shit breaks all the time but I repair it all the time too... Email me here: <b>will (at) lolcat.ca</b> or create an issue.
Shit breaks all the time but I repair it all the time too. Email me here: <b>will (at) lolcat.ca</b> or create an issue.
## License
AGPL

19
api.txt
View File

@ -1,9 +1,16 @@
__ __ __
/ // / ____ ____ / /_
/ // /_/ __ `/ _ \/ __/
/__ __/ /_/ / __/ /_
/_/ \__, /\___/\__/
/____/
44
4444444 44
44444444 44444 444
44444444 444444 444444444
44444 44444444 444444444
444444444 4444444
4444444444 444444
4444444444444
444444444444444444
444444444444444
44444444
4444
44
+ Welcome to the 4get API documentation +

View File

@ -119,7 +119,7 @@ class config{
// Default user agent to use for scraper requests. Sometimes ignored to get specific webpages
// Changing this might break things.
const USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:131.0) Gecko/20100101 Firefox/131.0";
const USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:134.0) Gecko/20100101 Firefox/134.0";
// Proxy pool assignments for each scraper
// false = Use server's raw IP
@ -143,6 +143,8 @@ class config{
const PROXY_YT = false; // youtube
const PROXY_YEP = false;
const PROXY_PINTEREST = false;
const PROXY_FIVEHPX = false;
const PROXY_VSCO = false;
const PROXY_SEZNAM = false;
const PROXY_NAVER = false;
const PROXY_GREPPR = false;

View File

@ -75,6 +75,7 @@ class backend{
break;
case "socks5_hostname":
case "socks5h":
case "socks5a":
curl_setopt($curlproc, CURLOPT_PROXYTYPE, CURLPROXY_SOCKS5_HOSTNAME);
curl_setopt($curlproc, CURLOPT_PROXY, $address . ":" . $port);

View File

@ -838,10 +838,10 @@ class frontend{
}
$payload .=
'<a href="https://webcache.googleusercontent.com/search?q=cache:' . $urlencode . '" class="list" target="_BLANK"><img src="/favicon?s=https://google.com" alt="go">Google cache</a>' .
'<a href="https://web.archive.org/web/' . $urlencode . '" class="list" target="_BLANK"><img src="/favicon?s=https://archive.org" alt="ar">Archive.org</a>' .
'<a href="https://archive.ph/newest/' . htmlspecialchars($link) . '" class="list" target="_BLANK"><img src="/favicon?s=https://archive.is" alt="ar">Archive.is</a>' .
'<a href="https://ghostarchive.org/search?term=' . $urlencode . '" class="list" target="_BLANK"><img src="/favicon?s=https://ghostarchive.org" alt="gh">Ghostarchive</a>' .
'<a href="https://arquivo.pt/wayback/' . htmlspecialchars($link) . '" class="list" target="_BLANK"><img src="/favicon?s=https://arquivo.pt" alt="ar">Arquivo.pt</a>' .
'<a href="https://www.bing.com/search?q=url%3A' . $urlencode . '" class="list" target="_BLANK"><img src="/favicon?s=https://bing.com" alt="bi">Bing cache</a>' .
'<a href="https://megalodon.jp/?url=' . $urlencode . '" class="list" target="_BLANK"><img src="/favicon?s=https://megalodon.jp" alt="me">Megalodon</a>' .
'</div>';
@ -969,7 +969,9 @@ class frontend{
"qwant" => "Qwant",
"yep" => "Yep",
"solofield" => "Solofield",
//"pinterest" => "Pinterest",
"pinterest" => "Pinterest",
"fivehpx" => "500px",
"vsco" => "VSCO",
"imgur" => "Imgur",
"ftm" => "FindThatMeme"
]

View File

@ -526,4 +526,85 @@ class fuckhtml{
$string
);
}
/*
	Extract the first balanced JSON array/object embedded in a larger string.
	
	Scans $json left to right. The payload starts at the first "[" or "{"
	found outside of a quoted string and ends at the character where both
	the array and object nesting levels return to zero.
	
	@param string $json Text containing a JSON array or object somewhere in it
	@return string|null The extracted substring, or null when no balanced
	                    array/object could be found
*/
public function extract_json($json){
	
	$len = strlen($json);
	$array_level = 0;
	$object_level = 0;
	$in_quote = null;
	$start = null;
	
	for($i=0; $i<$len; $i++){
		
		$char = $json[$i];
		
		switch($char){
			
			case "[":
			case "{":
				if($in_quote === null){
					
					if($char === "["){
						
						$array_level++;
					}else{
						
						$object_level++;
					}
					
					if($start === null){
						
						$start = $i;
					}
				}
				break;
			
			case "]":
			case "}":
				// closers seen before the payload starts are ignored,
				// otherwise a level goes negative and never balances
				if(
					$in_quote === null &&
					$start !== null
				){
					
					if($char === "]"){
						
						$array_level--;
					}else{
						
						$object_level--;
					}
				}
				break;
			
			case "\"":
			case "'":
				// a quote at offset 0 is deliberately left alone, as before
				if($i === 0){
					
					break;
				}
				
				// a quote is escaped only when preceded by an ODD number
				// of backslashes: \\" is an escaped backslash followed
				// by a real quote
				$backslashes = 0;
				
				for($k = $i - 1; $k >= 0 && $json[$k] === "\\"; $k--){
					
					$backslashes++;
				}
				
				if($backslashes % 2 === 0){
					
					// found a non-escaped quote
					if($in_quote === null){
						
						// open quote
						$in_quote = $char;
					}elseif($in_quote === $char){
						
						// close quote
						$in_quote = null;
					}
				}
				break;
		}
		
		if(
			$start !== null &&
			$array_level === 0 &&
			$object_level === 0
		){
			
			return substr($json, $start, $i - $start + 1);
		}
	}
	
	// no balanced payload found
	return null;
}
}

File diff suppressed because it is too large Load Diff

262
scraper/fivehpx.php Normal file
View File

@ -0,0 +1,262 @@
<?php
class fivehpx{
/*
	Pull in the shared helper libraries and keep one instance of each
	on the scraper object.
*/
public function __construct(){
	
	include "lib/backend.php";
	$backend = new backend("fivehpx");
	$this->backend = $backend;
	
	include "lib/fuckhtml.php";
	$fuckhtml = new fuckhtml();
	$this->fuckhtml = $fuckhtml;
}
/*
	Describe the filters this scraper exposes.
	
	@param mixed $page Not consulted; the same filter set is always returned
	@return array Filter definitions keyed by filter name
*/
public function getfilters($page){
	
	$sort_options = [];
	$sort_options["relevance"] = "Relevance";
	$sort_options["pulse"] = "Pulse";
	$sort_options["newest"] = "Newest";
	
	$sort_filter = [
		"display" => "Sort",
		"option" => $sort_options
	];
	
	return ["sort" => $sort_filter];
}
/*
	Perform one HTTP request through curl and return the raw response body.
	
	A plain GET with document-style headers is sent when $post_data is null;
	otherwise $post_data is sent as the POST body with JSON/XHR-style headers.
	
	@param mixed $proxy Proxy descriptor handed to backend::assign_proxy
	@param string $url Target URL, without query string
	@param array $get Query parameters appended to $url when non-empty
	@param string|null $post_data Request body; null means GET
	@return string Response body
	@throws Exception carrying the curl error message when the transfer fails
*/
private function get($proxy, $url, $get = [], $post_data = null){
	
	$is_post = $post_data !== null;
	
	if($get !== []){
		
		$url .= "?" . http_build_query($get);
	}
	
	if($is_post){
		
		$headers = [
			"User-Agent: " . config::USER_AGENT,
			"Accept: */*",
			"Accept-Language: en-US,en;q=0.5",
			"Accept-Encoding: gzip",
			"Referer: https://500px.com/",
			"content-type: application/json",
			//"x-csrf-token: undefined",
			"x-500px-source: Search",
			"Content-Length: " . strlen($post_data),
			"Origin: https://500px.com",
			"DNT: 1",
			"Sec-GPC: 1",
			"Connection: keep-alive",
			// "Cookie: _pin_unauth, _fbp, _sharedID, _sharedID_cst",
			"Sec-Fetch-Dest: empty",
			"Sec-Fetch-Mode: cors",
			"Sec-Fetch-Site: same-site",
			"Priority: u=4",
			"TE: trailers"
		];
	}else{
		
		$headers = [
			"User-Agent: " . config::USER_AGENT,
			"Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
			"Accept-Language: en-US,en;q=0.5",
			"Accept-Encoding: gzip",
			"DNT: 1",
			"Sec-GPC: 1",
			"Connection: keep-alive",
			"Upgrade-Insecure-Requests: 1",
			"Sec-Fetch-Dest: document",
			"Sec-Fetch-Mode: navigate",
			"Sec-Fetch-Site: same-origin",
			"Sec-Fetch-User: ?1",
			"Priority: u=0, i",
			"TE: trailers"
		];
	}
	
	$options = [
		CURLOPT_URL => $url,
		CURLOPT_ENCODING => "", // default encoding
		CURLOPT_HTTPHEADER => $headers
	];
	
	if($is_post){
		
		// set post data
		$options[CURLOPT_POST] = true;
		$options[CURLOPT_POSTFIELDS] = $post_data;
	}
	
	$options[CURLOPT_RETURNTRANSFER] = true;
	$options[CURLOPT_SSL_VERIFYHOST] = 2;
	$options[CURLOPT_SSL_VERIFYPEER] = true;
	$options[CURLOPT_CONNECTTIMEOUT] = 30;
	$options[CURLOPT_TIMEOUT] = 30;
	
	// http2 bypass
	$options[CURLOPT_HTTP_VERSION] = CURL_HTTP_VERSION_2_0;
	
	$curlproc = curl_init();
	curl_setopt_array($curlproc, $options);
	
	$this->backend->assign_proxy($curlproc, $proxy);
	
	$data = curl_exec($curlproc);
	
	if(curl_errno($curlproc)){
		
		throw new Exception(curl_error($curlproc));
	}
	
	curl_close($curlproc);
	
	return $data;
}
/*
	Search 500px for images.
	
	On the first page the search term comes from $get["s"] and the sort order
	from $get["sort"]. On later pages the stored pagination state (cursor,
	search, sort) is restored from the token in $get["npt"].
	
	@param array $get Request parameters; reads "npt", "s" and "sort"
	@return array ["status" => "ok", "npt" => token|null, "image" => [...]]
	@throws Exception on empty search term, unusable pagination token,
	                  transport failure, undecodable JSON or an API error
*/
public function image($get){
	
	if($get["npt"]){
		
		[$pagination, $proxy] =
			$this->backend->get(
				$get["npt"], "images"
			);
		
		$pagination = json_decode($pagination, true);
		
		// fail loudly instead of sending a request built from nulls
		if(
			!is_array($pagination) ||
			!isset($pagination["search"])
		){
			
			throw new Exception("Failed to decode pagination token");
		}
		
		$search = $pagination["search"];
		
	}else{
		
		$search = $get["s"];
		if(strlen($search) === 0){
			
			throw new Exception("Search term is empty!");
		}
		
		$proxy = $this->backend->get_ip();
		
		$pagination = [
			"sort" => strtoupper($get["sort"]),
			"search" => $search,
			"filters" => [],
			"nlp" => false,
		];
	}
	
	try{
		$json =
			$this->get(
				$proxy,
				"https://api.500px.com/graphql",
				[],
				json_encode([
					"operationName" => "PhotoSearchPaginationContainerQuery",
					"variables" => $pagination,
					// the $cursor variable and "after:" argument are only
					// declared when a cursor is present in the variables
					"query" =>
						'query PhotoSearchPaginationContainerQuery(' .
						(isset($pagination["cursor"]) ? '$cursor: String, ' : "") .
						'$sort: PhotoSort, $search: String!, $filters: [PhotoSearchFilter!], $nlp: Boolean) { ...PhotoSearchPaginationContainer_query_1vzAZD} fragment PhotoSearchPaginationContainer_query_1vzAZD on Query { photoSearch(sort: $sort, first: 100, ' .
						(isset($pagination["cursor"]) ? 'after: $cursor, ' : "") .
						'search: $search, filters: $filters, nlp: $nlp) { edges { node { id legacyId canonicalPath name description width height images(sizes: [33, 36]) { size url id } } } totalCount pageInfo { endCursor hasNextPage } }}'
				])
			);
	}catch(Exception $error){
		
		throw new Exception("Failed to fetch graphQL object");
	}
	
	$json = json_decode($json, true);
	
	if($json === null){
		
		throw new Exception("Failed to decode graphQL object");
	}
	
	if(isset($json["errors"][0]["message"])){
		
		throw new Exception("500px returned an API error: " . $json["errors"][0]["message"]);
	}
	
	if(!isset($json["data"]["photoSearch"]["edges"])){
		
		throw new Exception("No edges returned by API");
	}
	
	$out = [
		"status" => "ok",
		"npt" => null,
		"image" => []
	];
	
	foreach($json["data"]["photoSearch"]["edges"] as $image){
		
		if(!isset($image["node"])){
			
			continue;
		}
		
		$image = $image["node"];
		
		// a result is only usable with both renditions and a page link
		if(
			!isset($image["images"][0]["url"]) ||
			!isset($image["images"][1]["url"]) ||
			!isset($image["canonicalPath"])
		){
			
			continue;
		}
		
		// NOTE(review): name/description are assumed to be nullable in
		// the API response; normalise to "" before text extraction
		$title =
			trim(
				$this->fuckhtml
				->getTextContent(
					$image["name"] ?? ""
				) . ": " .
				$this->fuckhtml
				->getTextContent(
					$image["description"] ?? ""
				)
				, " :"
			);
		
		$width = (int)($image["width"] ?? 0);
		$height = (int)($image["height"] ?? 0);
		
		if(
			$width > 0 &&
			$height > 0
		){
			
			$small = $this->image_ratio(600, $width, $height);
			$large = $this->image_ratio(2048, $width, $height);
		}else{
			
			// unknown dimensions: never divide by zero
			$small = [null, null];
			$large = [null, null];
		}
		
		$out["image"][] = [
			"title" => $title,
			"source" => [
				[
					"url" => $image["images"][1]["url"],
					"width" => $large[0],
					"height" => $large[1]
				],
				[
					"url" => $image["images"][0]["url"],
					"width" => $small[0],
					"height" => $small[1]
				]
			],
			"url" => "https://500px.com" . $image["canonicalPath"]
		];
	}
	
	// get NPT token
	$page_info = $json["data"]["photoSearch"]["pageInfo"] ?? [];
	
	if(
		isset($page_info["hasNextPage"]) &&
		$page_info["hasNextPage"] === true &&
		isset($page_info["endCursor"])
	){
		
		$out["npt"] =
			$this->backend->store(
				json_encode([
					"cursor" => $page_info["endCursor"],
					"search" => $search,
					"sort" => $pagination["sort"],
					"filters" => [],
					"nlp" => false
				]),
				"images",
				$proxy
			);
	}
	
	return $out;
}
/*
	Scale $width x $height so it fits inside a square of $longest_edge,
	keeping the aspect ratio.
	
	Integer arithmetic (multiply first, then intdiv) is used so the
	constraining side is exactly $longest_edge and the results are ints
	rather than floats.
	
	@param int $longest_edge Size of the bounding square, in pixels
	@param int $width Original width
	@param int $height Original height
	@return array [width, height] as ints, or [null, null] when either
	              original dimension is not a positive number
*/
private function image_ratio($longest_edge, $width, $height){
	
	$longest_edge = (int)$longest_edge;
	$width = (int)$width;
	$height = (int)$height;
	
	// unknown or degenerate dimensions would divide by zero
	if(
		$width <= 0 ||
		$height <= 0
	){
		
		return [null, null];
	}
	
	if($width > $height){
		
		// landscape: width is the constraining side
		return [
			$longest_edge,
			intdiv($height * $longest_edge, $width)
		];
	}
	
	// portrait or square: height is the constraining side
	return [
		intdiv($width * $longest_edge, $height),
		$longest_edge
	];
}
}

View File

@ -136,7 +136,7 @@ class ftm{
"source" => [
[
"url" =>
"https://findthatmeme.us-southeast-1.linodeobjects.com/" .
"https://s3.thehackerblog.com/findthatmeme/" .
$thumb,
"width" => null,
"height" => null

View File

@ -2531,6 +2531,8 @@ class google{
"div"
);
$date = null;
if(count($date_div) !== 0){
foreach($date_div as $div){
@ -2541,6 +2543,7 @@ class google{
"bottom:"
) !== false
){
$date =
strtotime(
$this->fuckhtml
@ -2548,7 +2551,6 @@ class google{
$div
)
);
break;
}
}
@ -4147,7 +4149,7 @@ class google{
throw new Exception("Failed to get HTML");
}
//$html = file_get_contents("scraper/google.html");
//$html = file_get_contents("scraper/google.txt");
return $this->parsepage($html, "web", $search, $proxy, $params);
}

View File

@ -227,7 +227,7 @@ class marginalia{
$json =
$this->get(
$this->backend->get_ip(), // no nextpage
"https://api.marginalia.nu/" . config::MARGINALIA_API_KEY . "/search/" . urlencode($search),
"https://api.marginalia-search.com/" . config::MARGINALIA_API_KEY . "/search/" . urlencode($search),
[
"count" => 20
]
@ -279,7 +279,7 @@ class marginalia{
$html =
$this->get(
$proxy,
"https://search.marginalia.nu/search?" . $params
"https://old-search.marginalia.nu/search?" . $params
);
}catch(Exception $error){
@ -308,7 +308,7 @@ class marginalia{
$html =
$this->get(
$proxy,
"https://search.marginalia.nu/search",
"https://old-search.marginalia.nu/search",
$params
);
}catch(Exception $error){

View File

@ -13,31 +13,104 @@ class pinterest{
return [];
}
private function get($proxy, $url, $get = []){
private function get($proxy, $url, $get = [], &$cookies, $header_data_post = null){
$curlproc = curl_init();
if($get !== []){
if($header_data_post === null){
// handling GET
// extract cookies
$cookies_tmp = [];
curl_setopt($curlproc, CURLOPT_HEADERFUNCTION, function($curlproc, $header) use (&$cookies_tmp){
$length = strlen($header);
$header = explode(":", $header, 2);
if(trim(strtolower($header[0])) == "set-cookie"){
$cookie_tmp = explode("=", trim($header[1]), 2);
$cookies_tmp[trim($cookie_tmp[0])] =
explode(";", $cookie_tmp[1], 2)[0];
}
return $length;
});
curl_setopt($curlproc, CURLOPT_HTTPHEADER,
["User-Agent: " . config::USER_AGENT,
"Accept: application/json, text/javascript, */*, q=0.01",
"Accept-Language: en-US,en;q=0.5",
"Accept-Encoding: gzip",
"Referer: https://ca.pinterest.com/",
"X-Requested-With: XMLHttpRequest",
"X-APP-VERSION: 78f8764",
"X-Pinterest-AppState: active",
"X-Pinterest-Source-Url: /",
"X-Pinterest-PWS-Handler: www/index.js",
"screen-dpr: 1",
"is-preload-enabled: 1",
"DNT: 1",
"Sec-GPC: 1",
"Sec-Fetch-Dest: empty",
"Sec-Fetch-Mode: cors",
"Sec-Fetch-Site: same-origin",
"Connection: keep-alive",
"Alt-Used: ca.pinterest.com",
"Priority: u=0",
"TE: trailers"]
);
if($get !== []){
$get = http_build_query($get);
$url .= "?" . $get;
}
}else{
// handling POST (pagination)
$get = http_build_query($get);
$url .= "?" . $get;
curl_setopt($curlproc, CURLOPT_HTTPHEADER,
["User-Agent: " . config::USER_AGENT,
"Accept: application/json, text/javascript, */*, q=0.01",
"Accept-Language: en-US,en;q=0.5",
"Accept-Encoding: gzip",
"Content-Type: application/x-www-form-urlencoded",
"Content-Length: " . strlen($get),
"Referer: https://ca.pinterest.com/",
"X-Requested-With: XMLHttpRequest",
"X-APP-VERSION: 78f8764",
"X-CSRFToken: " . $cookies["csrf"],
"X-Pinterest-AppState: active",
"X-Pinterest-Source-Url: /search/pins/?rs=ac&len=2&q=" . urlencode($header_data_post) . "&eq=" . urlencode($header_data_post),
"X-Pinterest-PWS-Handler: www/search/[scope].js",
"screen-dpr: 1",
"is-preload-enabled: 1",
"Origin: https://ca.pinterest.com",
"DNT: 1",
"Sec-GPC: 1",
"Sec-Fetch-Dest: empty",
"Sec-Fetch-Mode: cors",
"Sec-Fetch-Site: same-origin",
"Connection: keep-alive",
"Alt-Used: ca.pinterest.com",
"Cookie: " . $cookies["cookie"],
"TE: trailers"]
);
curl_setopt($curlproc, CURLOPT_POST, true);
curl_setopt($curlproc, CURLOPT_POSTFIELDS, $get);
}
curl_setopt($curlproc, CURLOPT_URL, $url);
curl_setopt($curlproc, CURLOPT_ENCODING, ""); // default encoding
curl_setopt($curlproc, CURLOPT_HTTPHEADER,
["User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:107.0) Gecko/20100101 Firefox/110.0",
"Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
"Accept-Language: en-US,en;q=0.5",
"Accept-Encoding: gzip",
"DNT: 1",
"Connection: keep-alive",
"Upgrade-Insecure-Requests: 1",
"Sec-Fetch-Dest: document",
"Sec-Fetch-Mode: navigate",
"Sec-Fetch-Site: none",
"Sec-Fetch-User: ?1"]
);
// http2 bypass
curl_setopt($curlproc, CURLOPT_HTTP_VERSION, CURL_HTTP_VERSION_2_0);
curl_setopt($curlproc, CURLOPT_RETURNTRANSFER, true);
curl_setopt($curlproc, CURLOPT_SSL_VERIFYHOST, 2);
@ -54,6 +127,26 @@ class pinterest{
throw new Exception(curl_error($curlproc));
}
if($header_data_post === null){
if(!isset($cookies_tmp["csrftoken"])){
throw new Exception("Failed to grep CSRF token");
}
$cookies = "";
foreach($cookies_tmp as $cookie_name => $cookie_value){
$cookies .= $cookie_name . "=" . $cookie_value . "; ";
}
$cookies = [
"csrf" => $cookies_tmp["csrftoken"],
"cookie" => rtrim($cookies, " ;")
];
}
curl_close($curlproc);
return $data;
}
@ -62,17 +155,68 @@ class pinterest{
if($get["npt"]){
// @TODO
// post data for next page
$data = [
"source_url" => "/search/pins/?q=" . urlencode($search) . "&rs=typed",
"data" =>
json_encode(
[$data, $proxy] =
$this->backend->get(
$get["npt"], "images"
);
$data = json_decode($data, true);
$search = $data["q"];
$cookies = $data["cookies"];
try{
$json =
$this->get(
$proxy,
"https://ca.pinterest.com/resource/BaseSearchResource/get/",
[
// {"options":{"applied_filters":null,"appliedProductFilters":"---","article":null,"auto_correction_disabled":false,"corpus":null,"customized_rerank_type":null,"domains":null,"filters":null,"journey_depth":null,"page_size":null,"price_max":null,"price_min":null,"query_pin_sigs":null,"query":"higurashi","redux_normalize_feed":true,"rs":"typed","scope":"pins","selected_one_bar_modules":null,"source_id":null,"source_module_id":null,"top_pin_id":null,"bookmarks":["Y2JVSG81V2sxcmNHRlpWM1J5VFVad1ZsWlVRbXhpVmtreVZsZHpOV0pIU2tkV2FscFhVbXhhVkZreU1WSmtNREZWVjIxR1RrMXNTbEJXYlhSaFVtMVdjMVZ1U2xaaWEzQnpXVlJPVTJWV1pISlhhM1JYVm10V05sVldVbE5XVjBwMVVXMUdWVll6VFhoVWJYaFhWMVp3Ums1V1RsTmlSbGt5Vm10YWFtVkdWbkpOU0dSUFZsZG9XRmxzWkc5VlZscHlWbGhrYkdKR1NubFdWelZQWVVaYWRHVkVRbFppUmtwVVZrUktWMlJIVWtWV2JHaHBVakZLU0Zkc1pEUmtNVnBZVW10b2FsSXdXbkJXYlRWRFpHeGFSMWRzVG1oaGVrWllXV3RvVTFVeFpFaFZiRUpoVm5wRk1GbHFSbXRYVjA1R1YyczFWMVpHV2pSWFZtaDNVakZrY2sxWVRsaGlhM0JXV1ZSR1MyRkdiRlZTYm1SVVVteHdXbGxWVlRGVk1VbDVWRmhrVjAxdVVuWlVhMXBTWlVaT2MxcEhSbE5TTWswMVdtdGFWMU5YU2paVmJYaFRUVmhDUjFZeU5YZFVNVkY0VjJ0b1ZXRnJOVlpVVmxwTFVURndXR042VmxOV2ExcGFXVlZWTlZVeFNYZE5WRTVYVWtWYVZGWkhNVTlXTVU1WllVWk9hR1ZyV2s1WFZ6QXhZakpPVjFWWWFHRlNWbkJRVm14U1IwMUdXWGxOVkVKVlRWWnNORll5TURWV1YwVjVWV3hDV21FeGNETmFSVnByVjFkS1IyTkhhR2xYUjJkM1ZtdGFhMlF4VVhsVGJGcE9Wa1p3YjFwWGVFdFZWbFp4VW14YWJGWnRVbHBaTUdoTFZHMUtTR1ZJYUZkV2VrWjJWMVphU21ReVJYcGpSbFpwVW10d1RGZHJVa0pPVms1SFZHNVNUbFl3V2xoVmJYUldaVVpaZUZremFGUk5hM0JYVkZaYVYyRkZNSGxWYkVKYVlrWlZlRnBGV210WFIwNUpVMnMxVTFaR1dscFdWekI0VFVaV1IxTllaR3BUUlhCb1dWUkdWbVZHVm5SbFJuQnNZbFpKTWxSVlVYaFBSVGxGV1hwR1QyVnJSVEZVVlZKT1RrVXhSVkpVUWs5bGJFVXhWRmhzZDFOR1ZsWmtNMFp0VWpGYWIxZFhjRXBsUlRGSVZWaHdUbFl4YTNoVVZWSnFUVVUxV0ZadGFFOVNSVnB6Vkd0a1drMUdiRFpUVkVaT1pXMWplRmRzVWxkaFJuQllWVlJTVDJWdFRqWlVNVkpTWlZad2NWcEhkRTlsYTFwMFZGVlNhMkpWTVZWVFZFcE9Wa1pzTmxkWE1WSk9WVEYwVlcweFVGWXdXVFJXUjNSWFYwZGFRbEJVTVRoUFJHTXhUbnBCTlUxRVRUUk5SRVV3VG5wUk5VMTVjRWhWVlhkeFprUlZlRTlFVVRKWlZHc3lUMWRSTWsxVVVUSk9iVnBvV1RKWmVrNTZXWGhPTWs1cFQwUkZNVTlFVm1sTlZ
GcHBUV3BTYTFsWFRtcE9SR015VG1wVk5GbHFaR2haVjFacldWUmFiVmxxWkdoYVZGWnFUa1JXT0ZSclZsaG1RVDA5fFVIbzVhRkpYZUc1WFYyUlpWVEpHYkdGNk1XWk5ha1ptVFZSR09FOUVZekZPZWtFMVRVUk5ORTFFUlRCT2VsRTFUWGx3U0ZWVmQzRm1SMWw1VFZSUk1WbDZUVEJhUjFGNVQxZFNhVnB0VlRGT1JFVXdXVlJuZVU1cVRUUk5hbU40VDBSSk1VNXFWVEZOYlZwcVdsUnJlRTFFVVhwWmVsVjNXbXBvYkU1dFJYbE9ha0Y2VDFSSk5VMTZWVEJaYWtJNFZHdFdXR1pCUFQwPXxOb25lfDg3NTcwOTAzODAxNDc0OTMqR1FMKnwzMjM3YjM3ZGNhMGU3YjYyYzYzYzAyZGJkNGU1MjdlNzMyMTExMTNlMmUyMzEyOWM2MDAzYmU1ZTlmZjkwYjAwfE5FV3w="]},"context":{}}
]
"source_url" => "/search/pins/?q=" . urlencode($search) . "&rs=typed",
"data" => json_encode(
[
"options" => [
"applied_unified_filters" => null,
"appliedProductFilters" => "---",
"article" => null,
"auto_correction_disabled" => false,
"corpus" => null,
"customized_rerank_type" => null,
"domains" => null,
"dynamicPageSizeExpGroup" => null,
"filters" => null,
"journey_depth" => null,
"page_size" => null,
"price_max" => null,
"price_min" => null,
"query_pin_sigs" => null,
"query" => $data["q"],
"redux_normalize_feed" => true,
"request_params" => null,
"rs" => "typed",
"scope" => "pins",
"selected_one_bar_modules" => null,
"source_id" => null,
"source_module_id" => null,
"source_url" => "/search/pins/?q=" . urlencode($search) . "&rs=typed",
"top_pin_id" => null,
"top_pin_ids" => null,
"bookmarks" => [
$data["bookmark"]
]
],
"context" => []
],
JSON_UNESCAPED_SLASHES
)
],
$cookies,
$search
);
];
}catch(Exception $error){
throw new Exception("Failed to fetch JSON");
}
}else{
@ -82,26 +226,44 @@ class pinterest{
throw new Exception("Search term is empty!");
}
// https://ca.pinterest.com/resource/BaseSearchResource/get/?source_url=%2Fsearch%2Fpins%2F%3Feq%3Dhigurashi%26etslf%3D5966%26len%3D2%26q%3Dhigurashi%2520when%2520they%2520cry%26rs%3Dac&data=%7B%22options%22%3A%7B%22applied_unified_filters%22%3Anull%2C%22appliedProductFilters%22%3A%22---%22%2C%22article%22%3Anull%2C%22auto_correction_disabled%22%3Afalse%2C%22corpus%22%3Anull%2C%22customized_rerank_type%22%3Anull%2C%22domains%22%3Anull%2C%22dynamicPageSizeExpGroup%22%3Anull%2C%22filters%22%3Anull%2C%22journey_depth%22%3Anull%2C%22page_size%22%3Anull%2C%22price_max%22%3Anull%2C%22price_min%22%3Anull%2C%22query_pin_sigs%22%3Anull%2C%22query%22%3A%22higurashi%20when%20they%20cry%22%2C%22redux_normalize_feed%22%3Atrue%2C%22request_params%22%3Anull%2C%22rs%22%3A%22ac%22%2C%22scope%22%3A%22pins%22%2C%22selected_one_bar_modules%22%3Anull%2C%22source_id%22%3Anull%2C%22source_module_id%22%3Anull%2C%22source_url%22%3A%22%2Fsearch%2Fpins%2F%3Feq%3Dhigurashi%26etslf%3D5966%26len%3D2%26q%3Dhigurashi%2520when%2520they%2520cry%26rs%3Dac%22%2C%22top_pin_id%22%3Anull%2C%22top_pin_ids%22%3Anull%7D%2C%22context%22%3A%7B%7D%7D&_=1736116313987
// source_url=%2Fsearch%2Fpins%2F%3Feq%3Dhigurashi%26etslf%3D5966%26len%3D2%26q%3Dhigurashi%2520when%2520they%2520cry%26rs%3Dac
// &data=%7B%22options%22%3A%7B%22applied_unified_filters%22%3Anull%2C%22appliedProductFilters%22%3A%22---%22%2C%22article%22%3Anull%2C%22auto_correction_disabled%22%3Afalse%2C%22corpus%22%3Anull%2C%22customized_rerank_type%22%3Anull%2C%22domains%22%3Anull%2C%22dynamicPageSizeExpGroup%22%3Anull%2C%22filters%22%3Anull%2C%22journey_depth%22%3Anull%2C%22page_size%22%3Anull%2C%22price_max%22%3Anull%2C%22price_min%22%3Anull%2C%22query_pin_sigs%22%3Anull%2C%22query%22%3A%22higurashi%20when%20they%20cry%22%2C%22redux_normalize_feed%22%3Atrue%2C%22request_params%22%3Anull%2C%22rs%22%3A%22ac%22%2C%22scope%22%3A%22pins%22%2C%22selected_one_bar_modules%22%3Anull%2C%22source_id%22%3Anull%2C%22source_module_id%22%3Anull%2C%22source_url%22%3A%22%2Fsearch%2Fpins%2F%3Feq%3Dhigurashi%26etslf%3D5966%26len%3D2%26q%3Dhigurashi%2520when%2520they%2520cry%26rs%3Dac%22%2C%22top_pin_id%22%3Anull%2C%22top_pin_ids%22%3Anull%7D%2C%22context%22%3A%7B%7D%7D
// &_=1736116313987
$source_url = "/search/pins/?q=" . urlencode($search) . "&rs=" . urlencode($search);
$filter = [
"source_url" => "/search/pins/?q=" . urlencode($search),
"source_url" => $source_url,
"rs" => "typed",
"data" =>
json_encode(
[
"options" => [
"article" => null,
"applied_filters" => null,
"applied_unified_filters" => null,
"appliedProductFilters" => "---",
"auto_correction_disabled" => false,
"article" => null,
"corpus" => null,
"customized_rerank_type" => null,
"domains" => null,
"dynamicPageSizeExpGroup" => null,
"filters" => null,
"query" => $search,
"journey_depth" => null,
"page_size" => null,
"price_max" => null,
"price_min" => null,
"query_pin_sigs" => null,
"query" => $search,
"redux_normalize_feed" => true,
"rs" => "typed",
"request_params" => null,
"rs" => "ac",
"scope" => "pins", // pins, boards, videos,
"source_id" => null
"selected_one_bar_modules" => null,
"source_id" => null,
"source_module_id" => null,
"source_url" => $source_url,
"top_pin_id" => null,
"top_pin_ids" => null
],
"context" => []
]
@ -110,24 +272,26 @@ class pinterest{
];
$proxy = $this->backend->get_ip();
}
$cookies = [];
try{
$json =
json_decode(
try{
$json =
$this->get(
$proxy,
"https://www.pinterest.ca/resource/BaseSearchResource/get/",
$filter
),
true
);
"https://ca.pinterest.com/resource/BaseSearchResource/get/",
$filter,
$cookies,
null
);
}catch(Exception $error){
}catch(Exception $error){
throw new Exception("Failed to fetch JSON");
throw new Exception("Failed to fetch JSON");
}
}
$json = json_decode($json, true);
if($json === null){
throw new Exception("Failed to decode JSON");
@ -139,6 +303,60 @@ class pinterest{
"image" => []
];
if(
!isset(
$json["resource_response"]
["status"]
)
){
throw new Exception("Unknown API failure");
}
if($json["resource_response"]["status"] != "success"){
$status = "Got non-OK response: " . $json["resource_response"]["status"];
if(
isset(
$json["resource_response"]["message"]
)
){
$status .= " - " . $json["resource_response"]["message"];
}
throw new Exception($status);
}
if(
isset(
$json["resource_response"]["sensitivity"]
["notices"][0]["description"]["text"]
)
){
throw new Exception(
"Pinterest returned a notice: " .
$json["resource_response"]["sensitivity"]["notices"][0]["description"]["text"]
);
}
// get NPT
if(isset($json["resource_response"]["bookmark"])){
$out["npt"] =
$this->backend->store(
json_encode([
"q" => $search,
"bookmark" => $json["resource_response"]["bookmark"],
"cookies" => $cookies
]),
"images",
$proxy
);
}
foreach(
$json
["resource_response"]
@ -150,6 +368,7 @@ class pinterest{
switch($item["type"]){
case "pin":
case "board":
/*
Handle image object
@ -206,42 +425,15 @@ class pinterest{
"height" => (int)$thumb["height"]
]
],
"url" => "https://www.pinterest.com/pin/" . $item["id"]
"url" =>
$item["link"] === null ?
"https://ca.pinterest.com/pin/" . $item["id"] :
$item["link"]
];
break;
case "board":
if(isset($item["cover_pin"]["image_url"])){
$image = [
"url" => $item["cover_pin"]["image_url"],
"width" => (int)$item["cover_pin"]["size"][0],
"height" => (int)$item["cover_pin"]["size"][1]
];
}elseif(isset($item["image_cover_url_hd"])){
/*
$image = [
"url" =>
"width" => null,
"height" => null
];*/
}
break;
}
}
return $out;
}
private function getfullresimage($image, $has_og){
$has_og = $has_og ? "1200x" : "originals";
return
preg_replace(
'/https:\/\/i\.pinimg\.com\/[^\/]+\//',
"https://i.pinimg.com/" . $has_og . "/",
$image
);
}
}

257
scraper/vsco.php Normal file
View File

@ -0,0 +1,257 @@
<?php
class vsco{
/*
	Pull in the shared backend library and keep an instance of it
	on the scraper object.
*/
public function __construct(){
	
	include "lib/backend.php";
	
	$backend = new backend("vsco");
	$this->backend = $backend;
}
/*
	Describe the filters this scraper exposes.
	
	@param mixed $page Not consulted
	@return array Always empty: no filters are offered
*/
public function getfilters($page){
	
	return array();
}
/*
	Perform one HTTP GET through curl and return the raw response body.
	
	Without a bearer token, document-style headers are sent. With one,
	XHR-style headers are sent including the Authorization header and a
	Referer built from $get["query"].
	
	@param mixed $proxy Proxy descriptor handed to backend::assign_proxy
	@param string $url Target URL, without query string
	@param array $get Query parameters appended to $url when non-empty
	@param string|null $bearer Bearer token; null selects the plain page fetch
	@return string Response body
	@throws Exception carrying the curl error message when the transfer fails
*/
private function get($proxy, $url, $get = [], $bearer = null){
	
	if($get !== []){
		
		$url .= "?" . http_build_query($get);
	}
	
	if($bearer !== null){
		
		$headers = [
			"User-Agent: " . config::USER_AGENT,
			"Accept: */*",
			"Accept-Language: en-US",
			"Accept-Encoding: gzip",
			"Referer: https://vsco.co/search/images/" . urlencode($get["query"]),
			"authorization: Bearer " . $bearer,
			"content-type: application/json",
			"x-client-build: 1",
			"x-client-platform: web",
			"DNT: 1",
			"Sec-GPC: 1",
			"Connection: keep-alive",
			"Sec-Fetch-Dest: empty",
			"Sec-Fetch-Mode: cors",
			"Sec-Fetch-Site: same-origin",
			"Priority: u=0",
			"TE: trailers"
		];
	}else{
		
		$headers = [
			"User-Agent: " . config::USER_AGENT,
			"Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
			"Accept-Language: en-US,en;q=0.5",
			"Accept-Encoding: gzip",
			"DNT: 1",
			"Sec-GPC: 1",
			"Connection: keep-alive",
			"Upgrade-Insecure-Requests: 1",
			"Sec-Fetch-Dest: document",
			"Sec-Fetch-Mode: navigate",
			"Sec-Fetch-Site: same-origin",
			"Sec-Fetch-User: ?1",
			"Priority: u=0, i",
			"TE: trailers"
		];
	}
	
	$curlproc = curl_init();
	
	curl_setopt_array(
		$curlproc,
		[
			CURLOPT_URL => $url,
			CURLOPT_ENCODING => "", // default encoding
			CURLOPT_HTTPHEADER => $headers,
			CURLOPT_RETURNTRANSFER => true,
			CURLOPT_SSL_VERIFYHOST => 2,
			CURLOPT_SSL_VERIFYPEER => true,
			CURLOPT_CONNECTTIMEOUT => 30,
			CURLOPT_TIMEOUT => 30,
			// http2 bypass
			CURLOPT_HTTP_VERSION => CURL_HTTP_VERSION_2_0
		]
	);
	
	$this->backend->assign_proxy($curlproc, $proxy);
	
	$data = curl_exec($curlproc);
	
	if(curl_errno($curlproc)){
		
		throw new Exception(curl_error($curlproc));
	}
	
	curl_close($curlproc);
	
	return $data;
}
/*
	Search VSCO for images.
	
	On the first page a bearer token is scraped from the feed page and the
	search term comes from $get["s"]. On later pages the pagination state
	and bearer token are restored from the token in $get["npt"].
	
	@param array $get Request parameters; reads "npt" and "s"
	@return array ["status" => "ok", "npt" => token|null, "image" => [...]]
	@throws Exception on empty search term, unusable pagination token,
	                  missing bearer token, transport failure or bad JSON
*/
public function image($get){
	
	if($get["npt"]){
		
		[$data, $proxy] =
			$this->backend->get(
				$get["npt"], "images"
			);
		
		$data = json_decode($data, true);
		
		// fail loudly instead of sending a request built from nulls
		if(
			!isset($data["pagination"]["query"]) ||
			!isset($data["pagination"]["page"]) ||
			!isset($data["bearer"])
		){
			
			throw new Exception("Failed to decode pagination token");
		}
		
	}else{
		
		$search = $get["s"];
		if(strlen($search) === 0){
			
			throw new Exception("Search term is empty!");
		}
		
		$proxy = $this->backend->get_ip();
		
		// get bearer token
		try{
			
			$html =
				$this->get(
					$proxy,
					"https://vsco.co/feed"
				);
			
		}catch(Exception $error){
			
			throw new Exception("Failed to fetch feed page");
		}
		
		// NOTE(review): the A-z range also matches [ \ ] ^ _ and `;
		// left untouched since the real token alphabet is not known here
		preg_match(
			'/"tkn":"([A-z0-9]+)"/',
			$html,
			$bearer
		);
		
		if(!isset($bearer[1])){
			
			throw new Exception("Failed to grep bearer token");
		}
		
		$data = [
			"pagination" => [
				"query" => $search,
				"page" => 0,
				"size" => 100
			],
			"bearer" => $bearer[1]
		];
	}
	
	try{
		
		$json =
			$this->get(
				$proxy,
				"https://vsco.co/api/2.0/search/images",
				$data["pagination"],
				$data["bearer"]
			);
		
	}catch(Exception $error){
		
		throw new Exception("Failed to fetch JSON");
	}
	
	$json = json_decode($json, true);
	
	if($json === null){
		
		throw new Exception("Failed to decode JSON");
	}
	
	$out = [
		"status" => "ok",
		"npt" => null,
		"image" => []
	];
	
	if(!isset($json["results"])){
		
		throw new Exception("Failed to access results object");
	}
	
	foreach($json["results"] as $image){
		
		// skip entries that cannot produce an image URL and a page link
		if(
			!isset($image["responsive_url"]) ||
			!isset($image["grid"]["domain"]) ||
			!isset($image["imageId"])
		){
			
			continue;
		}
		
		$image_domain = parse_url("https://" . $image["responsive_url"], PHP_URL_HOST);
		
		// explode() refuses an empty separator
		if(
			!is_string($image_domain) ||
			$image_domain === ""
		){
			
			continue;
		}
		
		$parts = explode($image_domain, $image["responsive_url"], 2);
		
		if(!isset($parts[1])){
			
			continue;
		}
		
		$thumbnail = $parts[1];
		
		// paths not starting with /1/ carry one extra leading segment
		if(substr($thumbnail, 0, 3) != "/1/"){
			
			$thumbnail =
				preg_replace(
					'/^\/[^\/]+/',
					"",
					$thumbnail
				);
		}
		
		$thumbnail = "https://img.vsco.co/cdn-cgi/image/width=480,height=360" . $thumbnail;
		
		$width = (int)($image["dimensions"]["width"] ?? 0);
		$height = (int)($image["dimensions"]["height"] ?? 0);
		
		if(
			$width > 0 &&
			$height > 0
		){
			
			$size = $this->image_ratio($width, $height);
		}else{
			
			// unknown dimensions: never divide by zero
			$size = [null, null];
		}
		
		$out["image"][] = [
			"title" => $image["description"] ?? null,
			"source" => [
				[
					"url" => "https://" . $image["responsive_url"],
					"width" => $width,
					"height" => $height
				],
				[
					"url" => $thumbnail,
					"width" => $size[0],
					"height" => $size[1]
				]
			],
			"url" => "https://" . $image["grid"]["domain"] . "/media/" . $image["imageId"]
		];
	}
	
	// get NPT
	// derive the last page from the page size that was actually requested
	$page_size =
		isset($data["pagination"]["size"]) ?
		(int)$data["pagination"]["size"] :
		100;
	
	if($page_size <= 0){
		
		$page_size = 100;
	}
	
	$total = isset($json["total"]) ? (int)$json["total"] : 0;
	$max_page = ceil($total / $page_size);
	
	$data["pagination"]["page"]++;
	
	if($max_page > $data["pagination"]["page"]){
		
		$out["npt"] =
			$this->backend->store(
				json_encode($data),
				"images",
				$proxy
			);
	}
	
	return $out;
}
/*
	Scale $width x $height so it fits inside a 480x360 box, keeping the
	aspect ratio.
	
	Integer arithmetic (multiply first, then intdiv) is used so the
	constraining side is exactly 480 or 360 and the results are ints
	rather than floats.
	
	@param int $width Original width
	@param int $height Original height
	@return array [width, height] as ints, or [null, null] when either
	              original dimension is not a positive number
*/
private function image_ratio($width, $height){
	
	$width = (int)$width;
	$height = (int)$height;
	
	// unknown or degenerate dimensions would divide by zero
	if(
		$width <= 0 ||
		$height <= 0
	){
		
		return [null, null];
	}
	
	// 480/$width < 360/$height, cross-multiplied to stay in integers
	if(480 * $height < 360 * $width){
		
		// width is the constraining side
		return [
			480,
			intdiv($height * 480, $width)
		];
	}
	
	// height is the constraining side (also the tie case)
	return [
		intdiv($width * 360, $height),
		360
	];
}
}

View File

@ -1209,15 +1209,16 @@ class yt{
$reel =
$reel
->reelItemRenderer;
->shortsLockupViewModel;
array_push(
$this->out["reel"],
[
"title" =>
$reel
->headline
->simpleText,
->overlayMetadata
->primaryText
->content,
"description" => null,
"author" => [
"name" => null,
@ -1225,30 +1226,22 @@ class yt{
"avatar" => null
],
"date" => null,
"duration" =>
$this->textualtime2int(
$reel
->accessibility
->accessibilityData
->label
),
"views" =>
$this->truncatedcount2int(
$reel
->viewCountText
->simpleText
),
"duration" => null,
"views" => null,
"thumb" => [
"url" =>
$reel
->thumbnail
->thumbnails[0]
->sources[0]
->url,
"ratio" => "9:16"
],
"url" =>
"https://www.youtube.com/watch?v=" .
$reel
->onTap
->innertubeCommand
->reelWatchEndpoint
->videoId
]
);

View File

@ -227,10 +227,18 @@ $settings = [
"value" => "solofield",
"text" => "Solofield"
],
/*[
[
"value" => "pinterest",
"text" => "Pinterest"
],*/
],
[
"value" => "fivehpx",
"text" => "500px"
],
[
"value" => "vsco",
"text" => "VSCO"
],
[
"value" => "imgur",
"text" => "Imgur"

View File

@ -16,6 +16,7 @@
body{
padding:15px 4% 40px;
margin:unset;
}
h1,h2,h3,h4,h5,h6{