forked from lolcat/4get
Compare commits
No commits in common. "86f8edda349b2158167253d08eaac502c6a40f42" and "631aa585654efbe91d850a0260b69a82924db6d1" have entirely different histories.
86f8edda34
...
631aa58565
|
@ -1 +0,0 @@
|
||||||
.git
|
|
|
@ -1,48 +0,0 @@
|
||||||
name: '4get CI'
|
|
||||||
|
|
||||||
on:
|
|
||||||
workflow_dispatch:
|
|
||||||
push:
|
|
||||||
branches:
|
|
||||||
- '*'
|
|
||||||
paths-ignore:
|
|
||||||
- 'README.md'
|
|
||||||
- 'docker-compose.yaml'
|
|
||||||
- '.gitignore'
|
|
||||||
- 'docs/**'
|
|
||||||
|
|
||||||
jobs:
|
|
||||||
build:
|
|
||||||
runs-on: docker
|
|
||||||
|
|
||||||
steps:
|
|
||||||
- uses: actions/checkout@v4
|
|
||||||
name: Checkout 4get repository
|
|
||||||
|
|
||||||
- uses: docker/setup-buildx-action@v3
|
|
||||||
name: Setup Docker BuildX system
|
|
||||||
|
|
||||||
- name: Login to Docker Container Registry
|
|
||||||
uses: docker/login-action@v3
|
|
||||||
with:
|
|
||||||
registry: git.lolcat.ca
|
|
||||||
username: ${{ secrets.USERNAME }}
|
|
||||||
password: ${{ secrets.TOKEN }}
|
|
||||||
|
|
||||||
- name: Docker meta
|
|
||||||
id: meta
|
|
||||||
uses: docker/metadata-action@v5
|
|
||||||
with:
|
|
||||||
images: git.lolcat.ca/lolcat/4get
|
|
||||||
tags: |
|
|
||||||
type=sha,format=short,prefix={{date 'YYYY.MM.DD'}}-,enable=${{ github.ref == format('refs/heads/{0}', 'master') }}
|
|
||||||
type=raw,value=latest,enable=${{ github.ref == format('refs/heads/{0}', 'master') }}
|
|
||||||
|
|
||||||
- uses: docker/build-push-action@v6
|
|
||||||
name: Build images
|
|
||||||
with:
|
|
||||||
context: .
|
|
||||||
file: Dockerfile
|
|
||||||
tags: ${{ steps.meta.outputs.tags }}
|
|
||||||
platforms: linux/amd64
|
|
||||||
push: true
|
|
|
@ -4,6 +4,7 @@ WORKDIR /var/www/html/4get
|
||||||
RUN apk update && apk upgrade
|
RUN apk update && apk upgrade
|
||||||
RUN apk add php apache2-ssl php83-fileinfo php83-openssl php83-iconv php83-common php83-dom php83-sodium php83-curl curl php83-pecl-apcu php83-apache2 imagemagick php83-pecl-imagick php-mbstring imagemagick-webp imagemagick-jpeg
|
RUN apk add php apache2-ssl php83-fileinfo php83-openssl php83-iconv php83-common php83-dom php83-sodium php83-curl curl php83-pecl-apcu php83-apache2 imagemagick php83-pecl-imagick php-mbstring imagemagick-webp imagemagick-jpeg
|
||||||
|
|
||||||
|
COPY ./docker/apache/ /etc/apache2/
|
||||||
COPY . .
|
COPY . .
|
||||||
|
|
||||||
RUN chmod 777 /var/www/html/4get/icons
|
RUN chmod 777 /var/www/html/4get/icons
|
||||||
|
@ -13,5 +14,4 @@ EXPOSE 443
|
||||||
|
|
||||||
ENV FOURGET_PROTO=http
|
ENV FOURGET_PROTO=http
|
||||||
|
|
||||||
ENTRYPOINT ["./docker/docker-entrypoint.sh"]
|
CMD ["./docker/docker-entrypoint.sh"]
|
||||||
CMD ["start"]
|
|
||||||
|
|
18
README.md
18
README.md
|
@ -9,11 +9,9 @@ https://4get.ca/about
|
||||||
## Official instance
|
## Official instance
|
||||||
https://4get.ca , or visit the official instance list: https://4get.ca/instances
|
https://4get.ca , or visit the official instance list: https://4get.ca/instances
|
||||||
|
|
||||||
_NOT to be confused with 4get.ch, 4get.lol and friends! I **don't** host these._
|
|
||||||
|
|
||||||
## Totally unbiased comparison between alternatives
|
## Totally unbiased comparison between alternatives
|
||||||
|
|
||||||
| | 4get | searx(ng) | libreY | araa | hearch.co |
|
| | 4get | searx(ng) | libreY | araa | hearch |
|
||||||
|----------------------------|-------------------------|-----------|-------------|-----------|-------------------|
|
|----------------------------|-------------------------|-----------|-------------|-----------|-------------------|
|
||||||
| RAM usage | 200-400mb~ | 2GB~ | 200-400mb~ | 2GB~ | idk |
|
| RAM usage | 200-400mb~ | 2GB~ | 200-400mb~ | 2GB~ | idk |
|
||||||
| Does it suck | no (debunked by snopes) | yes | yes | a little | better than searx |
|
| Does it suck | no (debunked by snopes) | yes | yes | a little | better than searx |
|
||||||
|
@ -25,9 +23,9 @@ _NOT to be confused with 4get.ch, 4get.lol and friends! I **don't** host these._
|
||||||
3. Bot protection that *actually* filters out the bots (when configured)
|
3. Bot protection that *actually* filters out the bots (when configured)
|
||||||
4. Interface doesn't require javascript
|
4. Interface doesn't require javascript
|
||||||
5. Favicon fetcher with caching support & image proxy
|
5. Favicon fetcher with caching support & image proxy
|
||||||
6. Bunch of other shits
|
6. Bunch of other shit
|
||||||
|
|
||||||
tl;dr 4get is the best way to browse for shit.
|
tl;dr the best way to actually browse for shit.
|
||||||
|
|
||||||
# Supported websites
|
# Supported websites
|
||||||
|
|
||||||
|
@ -42,10 +40,10 @@ tl;dr 4get is the best way to browse for shit.
|
||||||
| Ghostery | Yep | Qwant | | | Qwant |
|
| Ghostery | Yep | Qwant | | | Qwant |
|
||||||
| Yep | Solofield | Solofield | | | Ghostery |
|
| Yep | Solofield | Solofield | | | Ghostery |
|
||||||
| Greppr | Pinterest | | | | Yep |
|
| Greppr | Pinterest | | | | Yep |
|
||||||
| Crowdview | 500px | | | | Marginalia |
|
| Crowdview | Imgur | | | | Marginalia |
|
||||||
| Mwmbl | VSCO | | | | YouTube |
|
| Mwmbl | FindThatMeme | | | | YouTube |
|
||||||
| Mojeek | Imgur | | | | Soundcloud |
|
| Mojeek | | | | | Soundcloud |
|
||||||
| Solofield | FindThatMeme | | | | |
|
| Solofield | | | | | |
|
||||||
| Marginalia | | | | | |
|
| Marginalia | | | | | |
|
||||||
| wiby | | | | | |
|
| wiby | | | | | |
|
||||||
| Curlie | | | | | |
|
| Curlie | | | | | |
|
||||||
|
@ -54,7 +52,7 @@ tl;dr 4get is the best way to browse for shit.
|
||||||
Refer to the <a href="https://git.lolcat.ca/lolcat/4get/src/branch/master/docs/">documentation index</a>. I recommend following the <a href="https://git.lolcat.ca/lolcat/4get/src/branch/master/docs/apache2.md">apache2 guide</a>.
|
Refer to the <a href="https://git.lolcat.ca/lolcat/4get/src/branch/master/docs/">documentation index</a>. I recommend following the <a href="https://git.lolcat.ca/lolcat/4get/src/branch/master/docs/apache2.md">apache2 guide</a>.
|
||||||
|
|
||||||
## Contact
|
## Contact
|
||||||
Shit breaks all the time but I repair it all the time too. Email me here: <b>will (at) lolcat.ca</b> or create an issue.
|
Shit breaks all the time but I repair it all the time too... Email me here: <b>will (at) lolcat.ca</b> or create an issue.
|
||||||
|
|
||||||
## License
|
## License
|
||||||
AGPL
|
AGPL
|
||||||
|
|
|
@ -119,7 +119,7 @@ class config{
|
||||||
|
|
||||||
// Default user agent to use for scraper requests. Sometimes ignored to get specific webpages
|
// Default user agent to use for scraper requests. Sometimes ignored to get specific webpages
|
||||||
// Changing this might break things.
|
// Changing this might break things.
|
||||||
const USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:134.0) Gecko/20100101 Firefox/134.0";
|
const USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:133.0) Gecko/20100101 Firefox/133.0";
|
||||||
|
|
||||||
// Proxy pool assignments for each scraper
|
// Proxy pool assignments for each scraper
|
||||||
// false = Use server's raw IP
|
// false = Use server's raw IP
|
||||||
|
@ -143,8 +143,6 @@ class config{
|
||||||
const PROXY_YT = false; // youtube
|
const PROXY_YT = false; // youtube
|
||||||
const PROXY_YEP = false;
|
const PROXY_YEP = false;
|
||||||
const PROXY_PINTEREST = false;
|
const PROXY_PINTEREST = false;
|
||||||
const PROXY_FIVEHPX = false;
|
|
||||||
const PROXY_VSCO = false;
|
|
||||||
const PROXY_SEZNAM = false;
|
const PROXY_SEZNAM = false;
|
||||||
const PROXY_NAVER = false;
|
const PROXY_NAVER = false;
|
||||||
const PROXY_GREPPR = false;
|
const PROXY_GREPPR = false;
|
||||||
|
|
|
@ -6,15 +6,14 @@ services:
|
||||||
image: luuul/4get:latest
|
image: luuul/4get:latest
|
||||||
restart: unless-stopped
|
restart: unless-stopped
|
||||||
environment:
|
environment:
|
||||||
- FOURGET_PROTO=http
|
|
||||||
- FOURGET_SERVER_NAME=4get.ca
|
- FOURGET_SERVER_NAME=4get.ca
|
||||||
- FOURGET_INSTANCES=https://4get.ca
|
|
||||||
|
|
||||||
ports:
|
ports:
|
||||||
- "80:80"
|
- "80:80"
|
||||||
- "443:443"
|
- "443:443"
|
||||||
|
|
||||||
# volumes:
|
volumes:
|
||||||
# - /etc/letsencrypt/live/domain.tld:/etc/4get/certs # mount ssl
|
- /etc/letsencrypt/live/domain.tld:/etc/4get/certs
|
||||||
# - ./banners:/var/www/html/4get/banner # mount custom banners
|
# mount custom banners and captcha
|
||||||
# - ./captcha:/var/www/html/4get/data/captcha # mount captcha images
|
- ./banners:/var/www/html/4get/banner
|
||||||
|
- ./captcha:/var/www/html/4get/data/captcha
|
||||||
|
|
|
@ -1 +0,0 @@
|
||||||
# intentionally blank
|
|
|
@ -8,27 +8,18 @@ FOURGET_PROTO="${FOURGET_PROTO#\"}"
|
||||||
# make lowercase
|
# make lowercase
|
||||||
FOURGET_PROTO=`echo $FOURGET_PROTO | awk '{print tolower($0)}'`
|
FOURGET_PROTO=`echo $FOURGET_PROTO | awk '{print tolower($0)}'`
|
||||||
|
|
||||||
FOURGET_SRC='/var/www/html/4get'
|
|
||||||
|
|
||||||
mkdir -p /etc/apache2
|
|
||||||
|
|
||||||
if [ "$FOURGET_PROTO" = "https" ]; then
|
if [ "$FOURGET_PROTO" = "https" ]; then
|
||||||
echo "Using https configuration"
|
echo "Using https configuration"
|
||||||
cp -r ${FOURGET_SRC}/docker/apache/https/httpd.conf /etc/apache2
|
cp /etc/apache2/https.conf /etc/apache2/httpd.conf
|
||||||
cp -r ${FOURGET_SRC}/docker/apache/https/conf.d/* /etc/apache2/conf.d
|
|
||||||
|
|
||||||
else
|
else
|
||||||
echo "Using http configuration"
|
echo "Using http configuration"
|
||||||
cp -r ${FOURGET_SRC}/docker/apache/http/httpd.conf /etc/apache2
|
cp /etc/apache2/http.conf /etc/apache2/httpd.conf
|
||||||
cp -r ${FOURGET_SRC}/docker/apache/http/conf.d/* /etc/apache2/conf.d
|
|
||||||
fi
|
fi
|
||||||
|
|
||||||
php ./docker/gen_config.php
|
php ./docker/gen_config.php
|
||||||
|
|
||||||
if [ "$@" = "start" ]; then
|
|
||||||
echo "4get is running"
|
echo "4get is running"
|
||||||
exec httpd -DFOREGROUND
|
exec httpd -DFOREGROUND
|
||||||
else
|
|
||||||
exec "$@"
|
|
||||||
fi
|
|
||||||
|
|
||||||
|
|
|
@ -970,8 +970,6 @@ class frontend{
|
||||||
"yep" => "Yep",
|
"yep" => "Yep",
|
||||||
"solofield" => "Solofield",
|
"solofield" => "Solofield",
|
||||||
"pinterest" => "Pinterest",
|
"pinterest" => "Pinterest",
|
||||||
"fivehpx" => "500px",
|
|
||||||
"vsco" => "VSCO",
|
|
||||||
"imgur" => "Imgur",
|
"imgur" => "Imgur",
|
||||||
"ftm" => "FindThatMeme"
|
"ftm" => "FindThatMeme"
|
||||||
]
|
]
|
||||||
|
|
|
@ -210,63 +210,6 @@ class brave{
|
||||||
return $data;
|
return $data;
|
||||||
}
|
}
|
||||||
|
|
||||||
private function get_js(){
|
|
||||||
|
|
||||||
$script_disc =
|
|
||||||
$this->fuckhtml
|
|
||||||
->getElementsByTagName(
|
|
||||||
"script"
|
|
||||||
);
|
|
||||||
|
|
||||||
$data = null;
|
|
||||||
foreach($script_disc as &$discs){
|
|
||||||
|
|
||||||
if(
|
|
||||||
preg_match(
|
|
||||||
'/kit\.start\(/',
|
|
||||||
$discs["innerHTML"]
|
|
||||||
)
|
|
||||||
){
|
|
||||||
|
|
||||||
$data =
|
|
||||||
explode(
|
|
||||||
"data:",
|
|
||||||
$discs["innerHTML"],
|
|
||||||
2
|
|
||||||
);
|
|
||||||
|
|
||||||
if(count($data) !== 2){
|
|
||||||
|
|
||||||
throw new Exception("Failed to split up data field");
|
|
||||||
}
|
|
||||||
|
|
||||||
$data = $data[1];
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if($data === null){
|
|
||||||
|
|
||||||
throw new Exception("Could not grep JavaScript object");
|
|
||||||
}
|
|
||||||
|
|
||||||
$data =
|
|
||||||
$this->fuckhtml
|
|
||||||
->parseJsObject(
|
|
||||||
$this->fuckhtml
|
|
||||||
->extract_json(
|
|
||||||
$data
|
|
||||||
)
|
|
||||||
);
|
|
||||||
|
|
||||||
if($data === null){
|
|
||||||
|
|
||||||
throw new Exception("Failed to decode JavaScript object");
|
|
||||||
}
|
|
||||||
|
|
||||||
return $data;
|
|
||||||
}
|
|
||||||
|
|
||||||
public function web($get){
|
public function web($get){
|
||||||
|
|
||||||
if($get["npt"]){
|
if($get["npt"]){
|
||||||
|
@ -403,7 +346,7 @@ class brave{
|
||||||
|
|
||||||
$nextpage =
|
$nextpage =
|
||||||
$this->fuckhtml
|
$this->fuckhtml
|
||||||
->getElementsByClassName("button", "a");
|
->getElementsByClassName("btn", "a");
|
||||||
|
|
||||||
if(count($nextpage) !== 0){
|
if(count($nextpage) !== 0){
|
||||||
|
|
||||||
|
@ -439,9 +382,55 @@ class brave{
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// do some magic
|
|
||||||
$this->fuckhtml->load($html);
|
$this->fuckhtml->load($html);
|
||||||
$data = $this->get_js();
|
|
||||||
|
$script_disc =
|
||||||
|
$this->fuckhtml
|
||||||
|
->getElementsByTagName(
|
||||||
|
"script"
|
||||||
|
);
|
||||||
|
|
||||||
|
$grep = [];
|
||||||
|
foreach($script_disc as $discs){
|
||||||
|
|
||||||
|
preg_match(
|
||||||
|
'/const data ?= ?(\[{.*}]);/',
|
||||||
|
$discs["innerHTML"],
|
||||||
|
$grep
|
||||||
|
);
|
||||||
|
|
||||||
|
if(isset($grep[1])){
|
||||||
|
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if(!isset($grep[1])){
|
||||||
|
|
||||||
|
throw new Exception("Could not grep JavaScript object");
|
||||||
|
}
|
||||||
|
|
||||||
|
$data =
|
||||||
|
rtrim(
|
||||||
|
preg_replace(
|
||||||
|
'/\(Array\(0\)\)\).*$/',
|
||||||
|
"",
|
||||||
|
$grep[1]
|
||||||
|
),
|
||||||
|
" ]"
|
||||||
|
) . "]";
|
||||||
|
|
||||||
|
$data =
|
||||||
|
$this->fuckhtml
|
||||||
|
->parseJsObject(
|
||||||
|
$data
|
||||||
|
);
|
||||||
|
unset($grep);
|
||||||
|
|
||||||
|
if($data === null){
|
||||||
|
|
||||||
|
throw new Exception("Failed to decode JavaScript object");
|
||||||
|
}
|
||||||
|
|
||||||
if(
|
if(
|
||||||
isset($data[2]["data"]["title"]) &&
|
isset($data[2]["data"]["title"]) &&
|
||||||
|
@ -1190,8 +1179,23 @@ class brave{
|
||||||
$proxy
|
$proxy
|
||||||
);
|
);
|
||||||
|
|
||||||
$this->fuckhtml->load($html);
|
preg_match(
|
||||||
$json = $this->get_js();
|
'/const data ?= ?(\[{.*}]);/',
|
||||||
|
$html,
|
||||||
|
$json
|
||||||
|
);
|
||||||
|
|
||||||
|
if(!isset($json[1])){
|
||||||
|
|
||||||
|
throw new Exception("Failed to grep javascript object");
|
||||||
|
}
|
||||||
|
|
||||||
|
$json = $this->fuckhtml->parseJsObject($json[1], true);
|
||||||
|
|
||||||
|
if($json === null){
|
||||||
|
|
||||||
|
throw new Exception("Failed to parse javascript object");
|
||||||
|
}
|
||||||
|
|
||||||
foreach(
|
foreach(
|
||||||
$json[1]["data"]["body"]["response"]["news"]["results"]
|
$json[1]["data"]["body"]["response"]["news"]["results"]
|
||||||
|
@ -1273,8 +1277,22 @@ class brave{
|
||||||
$html = fread($handle, filesize("scraper/brave-image.html"));
|
$html = fread($handle, filesize("scraper/brave-image.html"));
|
||||||
fclose($handle);*/
|
fclose($handle);*/
|
||||||
|
|
||||||
$this->fuckhtml->load($html);
|
preg_match(
|
||||||
$json = $this->get_js();
|
'/const data = (\[{.*}\]);/',
|
||||||
|
$html,
|
||||||
|
$json
|
||||||
|
);
|
||||||
|
|
||||||
|
if(!isset($json[1])){
|
||||||
|
|
||||||
|
throw new Exception("Failed to get data object");
|
||||||
|
}
|
||||||
|
|
||||||
|
$json =
|
||||||
|
$this->fuckhtml
|
||||||
|
->parseJsObject(
|
||||||
|
$json[1]
|
||||||
|
);
|
||||||
|
|
||||||
foreach(
|
foreach(
|
||||||
$json[1]
|
$json[1]
|
||||||
|
@ -1404,8 +1422,22 @@ class brave{
|
||||||
$html = fread($handle, filesize("scraper/brave-video.html"));
|
$html = fread($handle, filesize("scraper/brave-video.html"));
|
||||||
fclose($handle);*/
|
fclose($handle);*/
|
||||||
|
|
||||||
$this->fuckhtml->load($html);
|
preg_match(
|
||||||
$json = $this->get_js();
|
'/const data = (\[{.*}\]);/',
|
||||||
|
$html,
|
||||||
|
$json
|
||||||
|
);
|
||||||
|
|
||||||
|
if(!isset($json[1])){
|
||||||
|
|
||||||
|
throw new Exception("Failed to get data object");
|
||||||
|
}
|
||||||
|
|
||||||
|
$json =
|
||||||
|
$this->fuckhtml
|
||||||
|
->parseJsObject(
|
||||||
|
$json[1]
|
||||||
|
);
|
||||||
|
|
||||||
foreach(
|
foreach(
|
||||||
$json
|
$json
|
||||||
|
@ -1777,21 +1809,7 @@ class brave{
|
||||||
|
|
||||||
$nextpage =
|
$nextpage =
|
||||||
$this->fuckhtml
|
$this->fuckhtml
|
||||||
->getElementById(
|
->getElementsByClassName("btn", "a");
|
||||||
"pagination",
|
|
||||||
"div"
|
|
||||||
);
|
|
||||||
|
|
||||||
if($nextpage){
|
|
||||||
|
|
||||||
$this->fuckhtml->load($nextpage);
|
|
||||||
|
|
||||||
$nextpage =
|
|
||||||
$this->fuckhtml
|
|
||||||
->getElementsByClassName(
|
|
||||||
"button",
|
|
||||||
"a"
|
|
||||||
);
|
|
||||||
|
|
||||||
if(count($nextpage) !== 0){
|
if(count($nextpage) !== 0){
|
||||||
|
|
||||||
|
@ -1829,7 +1847,6 @@ class brave{
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
|
||||||
|
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,262 +0,0 @@
|
||||||
<?php
|
|
||||||
|
|
||||||
class fivehpx{
|
|
||||||
|
|
||||||
public function __construct(){
|
|
||||||
|
|
||||||
include "lib/backend.php";
|
|
||||||
$this->backend = new backend("fivehpx");
|
|
||||||
|
|
||||||
include "lib/fuckhtml.php";
|
|
||||||
$this->fuckhtml = new fuckhtml();
|
|
||||||
}
|
|
||||||
|
|
||||||
public function getfilters($page){
|
|
||||||
|
|
||||||
return [
|
|
||||||
"sort" => [
|
|
||||||
"display" => "Sort",
|
|
||||||
"option" => [
|
|
||||||
"relevance" => "Relevance",
|
|
||||||
"pulse" => "Pulse",
|
|
||||||
"newest" => "Newest"
|
|
||||||
]
|
|
||||||
]
|
|
||||||
];
|
|
||||||
}
|
|
||||||
|
|
||||||
private function get($proxy, $url, $get = [], $post_data = null){
|
|
||||||
|
|
||||||
$curlproc = curl_init();
|
|
||||||
|
|
||||||
if($get !== []){
|
|
||||||
$get = http_build_query($get);
|
|
||||||
$url .= "?" . $get;
|
|
||||||
}
|
|
||||||
|
|
||||||
curl_setopt($curlproc, CURLOPT_URL, $url);
|
|
||||||
|
|
||||||
curl_setopt($curlproc, CURLOPT_ENCODING, ""); // default encoding
|
|
||||||
|
|
||||||
if($post_data === null){
|
|
||||||
|
|
||||||
curl_setopt($curlproc, CURLOPT_HTTPHEADER,
|
|
||||||
["User-Agent: " . config::USER_AGENT,
|
|
||||||
"Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
|
|
||||||
"Accept-Language: en-US,en;q=0.5",
|
|
||||||
"Accept-Encoding: gzip",
|
|
||||||
"DNT: 1",
|
|
||||||
"Sec-GPC: 1",
|
|
||||||
"Connection: keep-alive",
|
|
||||||
"Upgrade-Insecure-Requests: 1",
|
|
||||||
"Sec-Fetch-Dest: document",
|
|
||||||
"Sec-Fetch-Mode: navigate",
|
|
||||||
"Sec-Fetch-Site: same-origin",
|
|
||||||
"Sec-Fetch-User: ?1",
|
|
||||||
"Priority: u=0, i",
|
|
||||||
"TE: trailers"]
|
|
||||||
);
|
|
||||||
}else{
|
|
||||||
|
|
||||||
curl_setopt($curlproc, CURLOPT_HTTPHEADER,
|
|
||||||
["User-Agent: " . config::USER_AGENT,
|
|
||||||
"Accept: */*",
|
|
||||||
"Accept-Language: en-US,en;q=0.5",
|
|
||||||
"Accept-Encoding: gzip",
|
|
||||||
"Referer: https://500px.com/",
|
|
||||||
"content-type: application/json",
|
|
||||||
//"x-csrf-token: undefined",
|
|
||||||
"x-500px-source: Search",
|
|
||||||
"Content-Length: " . strlen($post_data),
|
|
||||||
"Origin: https://500px.com",
|
|
||||||
"DNT: 1",
|
|
||||||
"Sec-GPC: 1",
|
|
||||||
"Connection: keep-alive",
|
|
||||||
// "Cookie: _pin_unauth, _fbp, _sharedID, _sharedID_cst",
|
|
||||||
"Sec-Fetch-Dest: empty",
|
|
||||||
"Sec-Fetch-Mode: cors",
|
|
||||||
"Sec-Fetch-Site: same-site",
|
|
||||||
"Priority: u=4",
|
|
||||||
"TE: trailers"]
|
|
||||||
);
|
|
||||||
|
|
||||||
// set post data
|
|
||||||
curl_setopt($curlproc, CURLOPT_POST, true);
|
|
||||||
curl_setopt($curlproc, CURLOPT_POSTFIELDS, $post_data);
|
|
||||||
}
|
|
||||||
|
|
||||||
curl_setopt($curlproc, CURLOPT_RETURNTRANSFER, true);
|
|
||||||
curl_setopt($curlproc, CURLOPT_SSL_VERIFYHOST, 2);
|
|
||||||
curl_setopt($curlproc, CURLOPT_SSL_VERIFYPEER, true);
|
|
||||||
curl_setopt($curlproc, CURLOPT_CONNECTTIMEOUT, 30);
|
|
||||||
curl_setopt($curlproc, CURLOPT_TIMEOUT, 30);
|
|
||||||
|
|
||||||
// http2 bypass
|
|
||||||
curl_setopt($curlproc, CURLOPT_HTTP_VERSION, CURL_HTTP_VERSION_2_0);
|
|
||||||
|
|
||||||
$this->backend->assign_proxy($curlproc, $proxy);
|
|
||||||
|
|
||||||
$data = curl_exec($curlproc);
|
|
||||||
|
|
||||||
if(curl_errno($curlproc)){
|
|
||||||
|
|
||||||
throw new Exception(curl_error($curlproc));
|
|
||||||
}
|
|
||||||
|
|
||||||
curl_close($curlproc);
|
|
||||||
return $data;
|
|
||||||
}
|
|
||||||
|
|
||||||
public function image($get){
|
|
||||||
|
|
||||||
if($get["npt"]){
|
|
||||||
|
|
||||||
[$pagination, $proxy] =
|
|
||||||
$this->backend->get(
|
|
||||||
$get["npt"], "images"
|
|
||||||
);
|
|
||||||
|
|
||||||
$pagination = json_decode($pagination, true);
|
|
||||||
$search = $pagination["search"];
|
|
||||||
|
|
||||||
}else{
|
|
||||||
|
|
||||||
$search = $get["s"];
|
|
||||||
if(strlen($search) === 0){
|
|
||||||
|
|
||||||
throw new Exception("Search term is empty!");
|
|
||||||
}
|
|
||||||
|
|
||||||
$proxy = $this->backend->get_ip();
|
|
||||||
$pagination = [
|
|
||||||
"sort" => strtoupper($get["sort"]),
|
|
||||||
"search" => $search,
|
|
||||||
"filters" => [],
|
|
||||||
"nlp" => false,
|
|
||||||
];
|
|
||||||
}
|
|
||||||
|
|
||||||
try{
|
|
||||||
|
|
||||||
$json =
|
|
||||||
$this->get(
|
|
||||||
$proxy,
|
|
||||||
"https://api.500px.com/graphql",
|
|
||||||
[],
|
|
||||||
json_encode([
|
|
||||||
"operationName" => "PhotoSearchPaginationContainerQuery",
|
|
||||||
"variables" => $pagination,
|
|
||||||
"query" =>
|
|
||||||
'query PhotoSearchPaginationContainerQuery(' .
|
|
||||||
(isset($pagination["cursor"]) ? '$cursor: String, ' : "") .
|
|
||||||
'$sort: PhotoSort, $search: String!, $filters: [PhotoSearchFilter!], $nlp: Boolean) { ...PhotoSearchPaginationContainer_query_1vzAZD} fragment PhotoSearchPaginationContainer_query_1vzAZD on Query { photoSearch(sort: $sort, first: 100, ' .
|
|
||||||
(isset($pagination["cursor"]) ? 'after: $cursor, ' : "") .
|
|
||||||
'search: $search, filters: $filters, nlp: $nlp) { edges { node { id legacyId canonicalPath name description width height images(sizes: [33, 36]) { size url id } } } totalCount pageInfo { endCursor hasNextPage } }}'
|
|
||||||
])
|
|
||||||
);
|
|
||||||
}catch(Exception $error){
|
|
||||||
|
|
||||||
throw new Exception("Failed to fetch graphQL object");
|
|
||||||
}
|
|
||||||
|
|
||||||
$json = json_decode($json, true);
|
|
||||||
|
|
||||||
if($json === null){
|
|
||||||
|
|
||||||
throw new Exception("Failed to decode graphQL object");
|
|
||||||
}
|
|
||||||
|
|
||||||
if(isset($json["errors"][0]["message"])){
|
|
||||||
|
|
||||||
throw new Exception("500px returned an API error: " . $json["errors"][0]["message"]);
|
|
||||||
}
|
|
||||||
|
|
||||||
if(!isset($json["data"]["photoSearch"]["edges"])){
|
|
||||||
|
|
||||||
throw new Exception("No edges returned by API");
|
|
||||||
}
|
|
||||||
|
|
||||||
$out = [
|
|
||||||
"status" => "ok",
|
|
||||||
"npt" => null,
|
|
||||||
"image" => []
|
|
||||||
];
|
|
||||||
|
|
||||||
foreach($json["data"]["photoSearch"]["edges"] as $image){
|
|
||||||
|
|
||||||
$image = $image["node"];
|
|
||||||
$title =
|
|
||||||
trim(
|
|
||||||
$this->fuckhtml
|
|
||||||
->getTextContent(
|
|
||||||
$image["name"]
|
|
||||||
) . ": " .
|
|
||||||
$this->fuckhtml
|
|
||||||
->getTextContent(
|
|
||||||
$image["description"]
|
|
||||||
)
|
|
||||||
, " :"
|
|
||||||
);
|
|
||||||
|
|
||||||
$small = $this->image_ratio(600, $image["width"], $image["height"]);
|
|
||||||
$large = $this->image_ratio(2048, $image["width"], $image["height"]);
|
|
||||||
|
|
||||||
$out["image"][] = [
|
|
||||||
"title" => $title,
|
|
||||||
"source" => [
|
|
||||||
[
|
|
||||||
"url" => $image["images"][1]["url"],
|
|
||||||
"width" => $large[0],
|
|
||||||
"height" => $large[1]
|
|
||||||
],
|
|
||||||
[
|
|
||||||
"url" => $image["images"][0]["url"],
|
|
||||||
"width" => $small[0],
|
|
||||||
"height" => $small[1]
|
|
||||||
]
|
|
||||||
],
|
|
||||||
"url" => "https://500px.com" . $image["canonicalPath"]
|
|
||||||
];
|
|
||||||
}
|
|
||||||
|
|
||||||
// get NPT token
|
|
||||||
if($json["data"]["photoSearch"]["pageInfo"]["hasNextPage"] === true){
|
|
||||||
|
|
||||||
$out["npt"] =
|
|
||||||
$this->backend->store(
|
|
||||||
json_encode([
|
|
||||||
"cursor" => $json["data"]["photoSearch"]["pageInfo"]["endCursor"],
|
|
||||||
"search" => $search,
|
|
||||||
"sort" => $pagination["sort"],
|
|
||||||
"filters" => [],
|
|
||||||
"nlp" => false
|
|
||||||
]),
|
|
||||||
"images",
|
|
||||||
$proxy
|
|
||||||
);
|
|
||||||
}
|
|
||||||
|
|
||||||
return $out;
|
|
||||||
}
|
|
||||||
|
|
||||||
private function image_ratio($longest_edge, $width, $height){
|
|
||||||
|
|
||||||
$ratio = [
|
|
||||||
$longest_edge / $width,
|
|
||||||
$longest_edge / $height
|
|
||||||
];
|
|
||||||
|
|
||||||
if($ratio[0] < $ratio[1]){
|
|
||||||
|
|
||||||
$ratio = $ratio[0];
|
|
||||||
}else{
|
|
||||||
|
|
||||||
$ratio = $ratio[1];
|
|
||||||
}
|
|
||||||
|
|
||||||
return [
|
|
||||||
floor($width * $ratio),
|
|
||||||
floor($height * $ratio)
|
|
||||||
];
|
|
||||||
}
|
|
||||||
}
|
|
3589
scraper/google.php
3589
scraper/google.php
File diff suppressed because it is too large
Load Diff
257
scraper/vsco.php
257
scraper/vsco.php
|
@ -1,257 +0,0 @@
|
||||||
<?php
|
|
||||||
|
|
||||||
class vsco{
|
|
||||||
|
|
||||||
public function __construct(){
|
|
||||||
|
|
||||||
include "lib/backend.php";
|
|
||||||
$this->backend = new backend("vsco");
|
|
||||||
}
|
|
||||||
|
|
||||||
public function getfilters($page){
|
|
||||||
|
|
||||||
return [];
|
|
||||||
}
|
|
||||||
|
|
||||||
private function get($proxy, $url, $get = [], $bearer = null){
|
|
||||||
|
|
||||||
$curlproc = curl_init();
|
|
||||||
|
|
||||||
if($get !== []){
|
|
||||||
$get_tmp = http_build_query($get);
|
|
||||||
$url .= "?" . $get_tmp;
|
|
||||||
}
|
|
||||||
|
|
||||||
curl_setopt($curlproc, CURLOPT_URL, $url);
|
|
||||||
|
|
||||||
curl_setopt($curlproc, CURLOPT_ENCODING, ""); // default encoding
|
|
||||||
|
|
||||||
if($bearer === null){
|
|
||||||
|
|
||||||
curl_setopt($curlproc, CURLOPT_HTTPHEADER,
|
|
||||||
["User-Agent: " . config::USER_AGENT,
|
|
||||||
"Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
|
|
||||||
"Accept-Language: en-US,en;q=0.5",
|
|
||||||
"Accept-Encoding: gzip",
|
|
||||||
"DNT: 1",
|
|
||||||
"Sec-GPC: 1",
|
|
||||||
"Connection: keep-alive",
|
|
||||||
"Upgrade-Insecure-Requests: 1",
|
|
||||||
"Sec-Fetch-Dest: document",
|
|
||||||
"Sec-Fetch-Mode: navigate",
|
|
||||||
"Sec-Fetch-Site: same-origin",
|
|
||||||
"Sec-Fetch-User: ?1",
|
|
||||||
"Priority: u=0, i",
|
|
||||||
"TE: trailers"]
|
|
||||||
);
|
|
||||||
}else{
|
|
||||||
|
|
||||||
curl_setopt($curlproc, CURLOPT_HTTPHEADER,
|
|
||||||
["User-Agent: " . config::USER_AGENT,
|
|
||||||
"Accept: */*",
|
|
||||||
"Accept-Language: en-US",
|
|
||||||
"Accept-Encoding: gzip",
|
|
||||||
"Referer: https://vsco.co/search/images/" . urlencode($get["query"]),
|
|
||||||
"authorization: Bearer " . $bearer,
|
|
||||||
"content-type: application/json",
|
|
||||||
"x-client-build: 1",
|
|
||||||
"x-client-platform: web",
|
|
||||||
"DNT: 1",
|
|
||||||
"Sec-GPC: 1",
|
|
||||||
"Connection: keep-alive",
|
|
||||||
"Sec-Fetch-Dest: empty",
|
|
||||||
"Sec-Fetch-Mode: cors",
|
|
||||||
"Sec-Fetch-Site: same-origin",
|
|
||||||
"Priority: u=0",
|
|
||||||
"TE: trailers"]
|
|
||||||
);
|
|
||||||
}
|
|
||||||
|
|
||||||
curl_setopt($curlproc, CURLOPT_RETURNTRANSFER, true);
|
|
||||||
curl_setopt($curlproc, CURLOPT_SSL_VERIFYHOST, 2);
|
|
||||||
curl_setopt($curlproc, CURLOPT_SSL_VERIFYPEER, true);
|
|
||||||
curl_setopt($curlproc, CURLOPT_CONNECTTIMEOUT, 30);
|
|
||||||
curl_setopt($curlproc, CURLOPT_TIMEOUT, 30);
|
|
||||||
|
|
||||||
// http2 bypass
|
|
||||||
curl_setopt($curlproc, CURLOPT_HTTP_VERSION, CURL_HTTP_VERSION_2_0);
|
|
||||||
|
|
||||||
$this->backend->assign_proxy($curlproc, $proxy);
|
|
||||||
|
|
||||||
$data = curl_exec($curlproc);
|
|
||||||
|
|
||||||
if(curl_errno($curlproc)){
|
|
||||||
|
|
||||||
throw new Exception(curl_error($curlproc));
|
|
||||||
}
|
|
||||||
|
|
||||||
curl_close($curlproc);
|
|
||||||
return $data;
|
|
||||||
}
|
|
||||||
|
|
||||||
public function image($get){
|
|
||||||
|
|
||||||
if($get["npt"]){
|
|
||||||
|
|
||||||
[$data, $proxy] =
|
|
||||||
$this->backend->get(
|
|
||||||
$get["npt"], "images"
|
|
||||||
);
|
|
||||||
|
|
||||||
$data = json_decode($data, true);
|
|
||||||
|
|
||||||
}else{
|
|
||||||
|
|
||||||
$search = $get["s"];
|
|
||||||
if(strlen($search) === 0){
|
|
||||||
|
|
||||||
throw new Exception("Search term is empty!");
|
|
||||||
}
|
|
||||||
|
|
||||||
$proxy = $this->backend->get_ip();
|
|
||||||
|
|
||||||
// get bearer token
|
|
||||||
try{
|
|
||||||
|
|
||||||
$html =
|
|
||||||
$this->get(
|
|
||||||
$proxy,
|
|
||||||
"https://vsco.co/feed"
|
|
||||||
);
|
|
||||||
|
|
||||||
}catch(Exception $error){
|
|
||||||
|
|
||||||
throw new Exception("Failed to fetch feed page");
|
|
||||||
}
|
|
||||||
|
|
||||||
preg_match(
|
|
||||||
'/"tkn":"([A-z0-9]+)"/',
|
|
||||||
$html,
|
|
||||||
$bearer
|
|
||||||
);
|
|
||||||
|
|
||||||
if(!isset($bearer[1])){
|
|
||||||
|
|
||||||
throw new Exception("Failed to grep bearer token");
|
|
||||||
}
|
|
||||||
|
|
||||||
$data = [
|
|
||||||
"pagination" => [
|
|
||||||
"query" => $search,
|
|
||||||
"page" => 0,
|
|
||||||
"size" => 100
|
|
||||||
],
|
|
||||||
"bearer" => $bearer[1]
|
|
||||||
];
|
|
||||||
}
|
|
||||||
|
|
||||||
try{
|
|
||||||
|
|
||||||
$json =
|
|
||||||
$this->get(
|
|
||||||
$proxy,
|
|
||||||
"https://vsco.co/api/2.0/search/images",
|
|
||||||
$data["pagination"],
|
|
||||||
$data["bearer"]
|
|
||||||
);
|
|
||||||
}catch(Exception $error){
|
|
||||||
|
|
||||||
throw new Exception("Failed to fetch JSON");
|
|
||||||
}
|
|
||||||
|
|
||||||
$json = json_decode($json, true);
|
|
||||||
|
|
||||||
if($json === null){
|
|
||||||
|
|
||||||
throw new Exception("Failed to decode JSON");
|
|
||||||
}
|
|
||||||
|
|
||||||
$out = [
|
|
||||||
"status" => "ok",
|
|
||||||
"npt" => null,
|
|
||||||
"image" => []
|
|
||||||
];
|
|
||||||
|
|
||||||
if(!isset($json["results"])){
|
|
||||||
|
|
||||||
throw new Exception("Failed to access results object");
|
|
||||||
}
|
|
||||||
|
|
||||||
foreach($json["results"] as $image){
|
|
||||||
|
|
||||||
$image_domain = parse_url("https://" . $image["responsive_url"], PHP_URL_HOST);
|
|
||||||
$thumbnail = explode($image_domain, $image["responsive_url"], 2)[1];
|
|
||||||
|
|
||||||
if(substr($thumbnail, 0, 3) != "/1/"){
|
|
||||||
|
|
||||||
$thumbnail =
|
|
||||||
preg_replace(
|
|
||||||
'/^\/[^\/]+/',
|
|
||||||
"",
|
|
||||||
$thumbnail
|
|
||||||
);
|
|
||||||
}
|
|
||||||
|
|
||||||
$thumbnail = "https://img.vsco.co/cdn-cgi/image/width=480,height=360" . $thumbnail;
|
|
||||||
$size =
|
|
||||||
$this->image_ratio(
|
|
||||||
(int)$image["dimensions"]["width"],
|
|
||||||
(int)$image["dimensions"]["height"]
|
|
||||||
);
|
|
||||||
|
|
||||||
$out["image"][] = [
|
|
||||||
"title" => $image["description"],
|
|
||||||
"source" => [
|
|
||||||
[
|
|
||||||
"url" => "https://" . $image["responsive_url"],
|
|
||||||
"width" => (int)$image["dimensions"]["width"],
|
|
||||||
"height" => (int)$image["dimensions"]["height"]
|
|
||||||
],
|
|
||||||
[
|
|
||||||
"url" => $thumbnail,
|
|
||||||
"width" => $size[0],
|
|
||||||
"height" => $size[1]
|
|
||||||
]
|
|
||||||
],
|
|
||||||
"url" => "https://" . $image["grid"]["domain"] . "/media/" . $image["imageId"]
|
|
||||||
];
|
|
||||||
}
|
|
||||||
|
|
||||||
// get NPT
|
|
||||||
$max_page = ceil($json["total"] / 100);
|
|
||||||
$data["pagination"]["page"]++;
|
|
||||||
|
|
||||||
if($max_page > $data["pagination"]["page"]){
|
|
||||||
|
|
||||||
$out["npt"] =
|
|
||||||
$this->backend->store(
|
|
||||||
json_encode($data),
|
|
||||||
"images",
|
|
||||||
$proxy
|
|
||||||
);
|
|
||||||
}
|
|
||||||
|
|
||||||
return $out;
|
|
||||||
}
|
|
||||||
|
|
||||||
private function image_ratio($width, $height){
|
|
||||||
|
|
||||||
$ratio = [
|
|
||||||
480 / $width,
|
|
||||||
360 / $height
|
|
||||||
];
|
|
||||||
|
|
||||||
if($ratio[0] < $ratio[1]){
|
|
||||||
|
|
||||||
$ratio = $ratio[0];
|
|
||||||
}else{
|
|
||||||
|
|
||||||
$ratio = $ratio[1];
|
|
||||||
}
|
|
||||||
|
|
||||||
return [
|
|
||||||
floor($width * $ratio),
|
|
||||||
floor($height * $ratio)
|
|
||||||
];
|
|
||||||
}
|
|
||||||
}
|
|
|
@ -231,14 +231,6 @@ $settings = [
|
||||||
"value" => "pinterest",
|
"value" => "pinterest",
|
||||||
"text" => "Pinterest"
|
"text" => "Pinterest"
|
||||||
],
|
],
|
||||||
[
|
|
||||||
"value" => "fivehpx",
|
|
||||||
"text" => "500px"
|
|
||||||
],
|
|
||||||
[
|
|
||||||
"value" => "vsco",
|
|
||||||
"text" => "VSCO"
|
|
||||||
],
|
|
||||||
[
|
[
|
||||||
"value" => "imgur",
|
"value" => "imgur",
|
||||||
"text" => "Imgur"
|
"text" => "Imgur"
|
||||||
|
|
Loading…
Reference in New Issue