From 785452873f0ee0a27fc157b482b7551560f0282d Mon Sep 17 00:00:00 2001 From: lolcat Date: Tue, 7 Nov 2023 08:04:56 -0500 Subject: [PATCH] fix typo --- README.md | 132 ++++----- about.php | 129 +-------- ami4get.php | 27 ++ api.txt | 5 + api/v1/ac.php | 93 ++++--- api/v1/images.php | 8 +- api/v1/music.php | 8 +- api/v1/news.php | 8 +- api/v1/videos.php | 8 +- api/v1/web.php | 16 +- audio.php | 1 + audio_sc.php | 1 + data/config.php | 103 +++++++ data/instances.php | 62 ----- data/proxies/.gitignore | 3 + data/proxies/onion.txt | 13 + favicon.php | 1 + images.php | 17 +- index.php | 3 +- instances.php | 55 ++++ lib/backend.php | 197 +++++++++++++ lib/captcha_gen.php | 32 ++- lib/curlproxy.php | 8 +- lib/frontend.php | 143 ++++------ lib/fuckhtml.php | 2 - lib/nextpage.php | 106 ------- music.php | 17 +- news.php | 17 +- opensearch.php | 29 ++ proxy.php | 1 + scraper/brave.php | 338 +++++++++++++++++----- scraper/ddg.php | 388 ++++++++++++++------------ scraper/facebook.php | 5 + scraper/ftm.php | 43 ++- scraper/google.php | 84 +++--- scraper/imgur.php | 37 ++- scraper/marginalia.php | 17 +- scraper/mojeek.php | 441 ++++++++++++++--------------- scraper/pinterest.php | 5 + scraper/sc.php | 53 ++-- scraper/wiby.php | 26 +- scraper/yandex.php | 85 ++++-- scraper/yep.php | 16 +- scraper/youtube.php | 37 ++- settings.php | 121 +++++--- sitemap.php | 35 +++ static/client.js | 32 ++- static/serverping.js | 495 +++++++++++++++++++++++++++++++++ static/style.css | 140 ++++++++-- static/themes/Cream.css | 31 +++ template/about.html | 77 +++++ template/header.html | 9 +- template/header_nofilters.html | 14 + template/home.html | 21 +- template/images.html | 2 +- template/instances.html | 36 +++ template/search.html | 2 +- videos.php | 17 +- web.php | 17 +- 59 files changed, 2592 insertions(+), 1277 deletions(-) create mode 100644 ami4get.php create mode 100644 data/config.php delete mode 100644 data/instances.php create mode 100644 data/proxies/.gitignore create mode 100644 
data/proxies/onion.txt create mode 100644 instances.php create mode 100644 lib/backend.php delete mode 100644 lib/nextpage.php create mode 100644 opensearch.php create mode 100644 sitemap.php create mode 100644 static/serverping.js create mode 100644 static/themes/Cream.css create mode 100644 template/about.html create mode 100644 template/header_nofilters.html create mode 100644 template/instances.html diff --git a/README.md b/README.md index 70c475e..f81ea98 100644 --- a/README.md +++ b/README.md @@ -3,12 +3,21 @@ # 4get 4get is a metasearch engine that doesn't suck (they live in our walls!) -## About 4get +# About 4get https://4get.ca/about -## Try it out +# Try it out https://4get.ca +# Totally unbiased comparison between alternatives + +| | 4get | searx(ng) | librex | araa | +|----------------------------|-------------------------|-----------|-------------|----------| +| RAM usage | 200-400mb~ | 2GB~ | 200-400mb~ | 2GB~ | +| Does it suck | no (debunked by snopes) | yes | yes | a little | +| Does it work | ye | no | no | ye | +| Did the dev commit suicide | not until my 30s | idk | yes | no | + ## Supported websites 1. Web - DuckDuckGo @@ -36,7 +45,6 @@ https://4get.ca 4. News - DuckDuckGo - Brave - - Google - Mojeek 5. Music @@ -55,15 +63,15 @@ https://4get.ca More scrapers are coming soon. I currently want to add Google web/video/news search, HackerNews (durr orange site!!) and Qwant. A shopping and files tab is also in my todo list. -# Setup +# Installation This section is still to-do. You will need to figure shit out for some of the apache2 and nginx stuff. Everything else should be OK. -## Apache +## Install on Apache Login as root. 
```sh -apt install apache2 certbot php-dom php-imagick imagemagick php-curl curl php-apcu git libapache2-mod-php python3-certbot-apache +apt install apache2 certbot php-imagick imagemagick php-curl curl php-apcu git libapache2-mod-php python3-certbot-apache service apache2 start a2enmod rewrite ``` @@ -90,7 +98,7 @@ chmod 777 -R icons/ Restart the service for good measure... `service apache2 restart` -## NGINX +## Install on NGINX Login as root. @@ -138,10 +146,54 @@ ln -s /etc/nginx/sites-available/4get.conf /etc/nginx/sites-available/4get.conf Now test the nginx config with `nginx -t`, if it says that everything is good, restart nginx using `systemctl restart nginx` -## Setup encryption +## Install using Docker (lol u lazy fuck) + +``` +docker run -d -p 80:80 -e FOURGET_SERVER_NAME="4get.ca" -e FOURGET_SERVER_ADMIN_EMAIL="you@example.com" luuul/4get:latest +``` + +...Or with SSL: +``` +docker run -d -p 443:443 -e FOURGET_SERVER_NAME="4get.ca" -e FOURGET_SERVER_ADMIN_EMAIL="you@example.com" -v /etc/letsencrypt/live/domain.tld:/etc/4get/certs luuul/4get:latest +``` + +replace environment variables FOURGET_SERVER_NAME and FOURGET_SERVER_ADMIN_EMAIL with relevant values + +if the certificate files are not mounted to /etc/4get/certs the service listens to port 80 +the certificate directory expects files named `cert.pem`, `chain.pem`, `privkey.pem` + +## Install using Docker Compose +copy `docker-compose.yaml` + +create a directory with images named `banners` for example and mount to `/var/www/html/4get/banner` +to serve custom banners + +``` +version: "3.7" + +services: + fourget: + image: luuul/4get:latest + restart: always + environment: + - FOURGET_SERVER_NAME=4get.ca + - FOURGET_SERVER_ADMIN_EMAIL="you@example.com" + + ports: + - "80:80" + - "443:443" + + volumes: + - /etc/letsencrypt/live/domain.tld:/etc/4get/certs + - ./banners:/var/www/html/4get/banner +``` + +Replace relevant values and start with `docker-compose up -d` + +# Encryption setup I'm schizoid (as 
you should) so I'm gonna setup 4096bit key encryption. To complete this step, you need a domain or subdomain in your possession. Make sure that the DNS shit for your domain has propagated properly before continuing, because certbot is a piece of shit that will error out the ass once you reach 5 attempts under an hour. -### Apache +## Encryption setup on Apache ```sh certbot --apache --rsa-key-size 4096 -d www.yourdomain.com -d yourdomain.com @@ -169,7 +221,7 @@ Restart again service apache2 restart ``` -### NGINX +## Encryption setup on NGINX Generate a certificate for the domain using: @@ -180,15 +232,13 @@ certbot --nginx --key-type ecdsa -d www.yourdomain.com -d yourdomain.com After doing that certbot should deploy the certificate automatically into your 4get nginx config file. It should be ready to use at that point. -## Captcha +# Jesse it is time to configure the server the fucking bots are back -Right now the setup for this shit is absolutely awful. +Wohoo the awful piece of shit setup and fiddling with 3 gazillion files is GONE. All you need to do to configure your shit is to go in `data/config.php` and edit the self-documenting configuration file. You can also specify proxies in `data/proxies/whatever.txt` and captcha images in `data/captcha/category/1.png`... I further explain how to deal with that garbage in the config file I mentioned. -Edit line 190 in `lib/captcha_gen.php` and specify your image sets. You can't disable the captcha right now lol. Just use a previous commit if you want to do that. Call me a shitcoder all you want I've had no energy lately. Images must be stored in `data/captcha`. Create a folder for each category. All files in there should be named from `1.png` to `321839.png`, for example. +# (Optional) Tor setup -## Tor Setup - -1. Install tor. +1. Install `tor`. 2. Open `/etc/tor/torrc` 3. Go to the line that contains `HiddenServiceDir` and `HiddenServicePort` 4. 
Uncomment those 2 lines and set them like this: @@ -205,7 +255,7 @@ After you get your onion address you will need to configure your Apache or Nginx I don't know to configure this shit on Apache so here is the NGINX one. -### NGINX +## Tor setup on NGINX Open your current 4get NGINX config (that is under `/etc/nginx/sites-available/`) and append this to the end of the file: @@ -240,49 +290,5 @@ server { Obviously replace `` by the onion address of `/var/lib/tor/4get/hostname` and then check if the nginx config is valid with `nginx -t` if yes, then restart the nginx service and try opening the onion address into the Tor Browser. You can see a real world example [here](https://git.zzls.xyz/Fijxu/etc-configs/src/branch/selfhost/nginx/sites-available/4get.zzls.xyz.conf) -## Docker Install - - -``` -docker run -d -p 80:80 -e FOURGET_SERVER_NAME="4get.ca" -e FOURGET_SERVER_ADMIN_EMAIL="you@example.com" luuul/4get:latest -``` - -With SSL -``` -docker run -d -p 443:443 -e FOURGET_SERVER_NAME="4get.ca" -e FOURGET_SERVER_ADMIN_EMAIL="you@example.com" -v /etc/letsencrypt/live/domain.tld:/etc/4get/certs luuul/4get:latest -``` - -replace enviroment variables FOURGET_SERVER_NAME and FOURGET_SERVER_ADMIN_EMAIL with relevant values - -if the certificate files are not mounted to /etc/4get/certs the service listens to port 80 -the certificate directory expects files named `cert.pem`, `chain.pem`, `privkey.pem` - -## Docker compose - -copy `docker-compose.yaml` - -create a directory with images named `banners` for example and mount to `/var/www/html/4get/banner` -to serve custom banners - -``` -version: "3.7" - -services: - fourget: - image: luuul/4get:latest - restart: always - environment: - - FOURGET_SERVER_NAME=4get.ca - - FOURGET_SERVER_ADMIN_EMAIL="you@example.com" - - ports: - - "80:80" - - "443:443" - - volumes: - - /etc/letsencrypt/live/domain.tld:/etc/4get/certs - - ./banners:/var/www/html/4get/banner -``` - -Replace relevant values and start with `docker-compose up -d` - 
+# Contact +shit breaks all the time but I repair it all the time too. Email me here: willlolcat(dot)ca diff --git a/about.php b/about.php index 385d313..939705b 100644 --- a/about.php +++ b/about.php @@ -1,128 +1,23 @@ ' . - '' . - '' . - '' . - 'About' . - '' . - '' . - '' . - '' . - '' . - '' . - '' . - ''; - -include "data/instances.php"; -$compiledinstancelist = ""; -foreach ($instancelist as $instance) -{ - $compiledinstancelist .= " ".$instance["name"].""; - $compiledinstancelist .= " ".$instance["address"]["displayname"].""; - foreach ($instance["altaddresses"] as $alt) - { - $compiledinstancelist .= "(".$alt["displayname"].")"; - } - $compiledinstancelist .= ""; -} + $frontend->load( + "header_nofilters.html", + [ + "title" => "About", + "class" => " class=\"about\"" + ] + ); $left = - '< Go back - -

Set as default search engine

-

On Firefox and other Gecko based browsers

- To set this as your default search engine on Firefox, right click the URL bar and select
Add "4get"
. Then, visit about:preferences#search and select
4get
in the dropdown menu. - -

On Chromium and Blink based browsers

- Click the 3 superpositioned dots at the top right of the screen and click on
Settings
, then search for
default search engine
, or visit chrome://settings/searchEngines.

- - Once you\'re there, click the pencil on the last entry under "Search engines" (it\'s probably DuckDuckGo). Once you do that, a popup will appear. Populate it with the following information: - - - - - - - - - - - - - - - - - - -
FieldValue
Search engine4get
Shortcut4get
URL with %s in place of queryhttps://4get.ca/web?s=%s
- - Once that\'s done, click
Save
. Then, on the right handside of the newly created entry, open the dropdown menu and select
Make default
. - -

Frequently asked questions

-

What is this?

- This is a metasearch engine that gets results from other engines, and strips away all of the tracking parameters and Microsoft/globohomo bullshit they add. Most of the other alternatives to Google jack themselves off about being ""privacy respecting"" or whatever the fuck but it always turns out to be a total lie, and I just got fed up with their shit honestly. Alternatives like Searx or YaCy all fucking sucks so I made my own thing. - -

My goal

- Provide users with a privacy oriented, extremely lightweight, ad free, free as in freedom (and free beer!) way to search for documents around the internet, with minimal, optional javascript code. My long term goal would be to build my own index (that doesn\'t suck) and provide users with an unbiased search engine, with no political inclinations. - -

Do you keep logs?

- I store data temporarly to get the next page of results. This might include search queries, tokens and other parameters. These parameters are encrypted using
aes-256-gcm
on the serber, for which I give you a key (also known internally as
npt
token). When you make a request to get the next page, you supply the token, the data is decrypted and the request is fulfilled. This encrypted data is deleted after 15 minutes, or after it\'s used, whichever comes first.

- - I don\'t log IP addresses, user agents, or anything else. The
npt
tokens are the only thing that are stored (in RAM, mind you), temporarly, encrypted. - -

Do you share information with third parties?

- Your search queries and supplied filters are shared with the scraper you chose (so I can get the search results, duh). I don\'t share anything else (that means I don\'t share your IP address, location, or anything of this kind). There is no way that site can know you\'re the one searching for something, unless you send out a search query that de-anonymises you. For example, a search query like "hello my full legal name is jonathan gallindo and i want pictures of cloacas" would definitively blow your cover. 4get doesn\'t contain ads or any third party javascript applets or trackers. I don\'t profile you, and quite frankly, I don\'t give a shit about what you search on there.

- - TL;DR assume those websites can see what you search for, but can\'t see who you are (unless you\'re really dumb). - -

Where is this website hosted?

- This website is hosted on a Contabo shitbox in the United States. - -

Keyboard shortcuts?

- Use
/
to focus the search box.

- - When the image viewer is open, you can use the following keybinds:
-
Up
,
Down
,
Left
,
Right
to rotate the image.
-
CTRL+Up
,
CTRL+Down
,
CTRL+Left
,
CTRL+Right
to mirror the image.
-
Escape
to exit the image viewer. - -

Instances

- 4get is open source, anyone can create their own 4get instance! If you wish to add your website to this list, please contact me. - - - - - - - '.$compiledinstancelist.' -
NameAddress
- -

How can I trust you?

- You just sort of have to take my word for it right now. If you\'d rather trust yourself instead of me (I believe in you!!), all of the code on this website is available trough my git page for you to host on your own machines. Just a reminder: if you\'re the sole user of your instance, it doesn\'t take immense brain power for Microshit to figure out you basically just switched IP addresses. Invite your friends to use your instance! - - - Donate to me trough ko-fi: ko-fi.com/lolcat
- Please donate I sent myself a donation for testing if it works and it looks fucking dumb. Reasons to donate are listed on there. Thank you! - -

I want to report abuse or have erotic roleplay trough email

- I don\'t know about that second part but if you want to talk to me, just drop me an email...

- - Message to all DMCA enforcers: I don\'t host any of the content. Everything you see here is proxied trough my shitbox with no moderation. Please reach out to the people hosting the infringing content instead.

- - Click here to contact me!

- - - Valid W3C HTML 4.01 - '; - -// trim out whitespace -$left = explode("\n", $left); + explode( + "\n", + file_get_contents("template/about.html") + ); $out = ""; diff --git a/ami4get.php b/ami4get.php new file mode 100644 index 0000000..f2d48bf --- /dev/null +++ b/ami4get.php @@ -0,0 +1,27 @@ + "ok", + "service" => "4get", + "server" => [ + "name" => config::SERVER_NAME, + "description" => config::SERVER_LONG_DESCRIPTION, + "bot_protection" => config::BOT_PROTECTION, + "real_requests" => $real_requests === false ? 0 : $real_requests, + "bot_requests" => $bot_requests === false ? 0 : $bot_requests, + "api_enabled" => config::API_ENABLED, + "alt_addresses" => config::ALT_ADDRESSES, + "version" => config::VERSION + ], + "instances" => config::INSTANCES + ] +); diff --git a/api.txt b/api.txt index f3c8b17..70e179c 100644 --- a/api.txt +++ b/api.txt @@ -119,6 +119,11 @@ /_____/_/ /_/\__,_/ .___/\____/_/_/ /_/\__/____/ /_/ ++ /ami4get + Tells you basic information about the 4get instance. CORS requests + are allowed on this endpoint. + + + /api/v1/web + &extendedsearch When using the ddg(DuckDuckGo) scraper, you may make use of the diff --git a/api/v1/ac.php b/api/v1/ac.php index 3ee1481..b1ec7dd 100644 --- a/api/v1/ac.php +++ b/api/v1/ac.php @@ -1,5 +1,6 @@ "https://api.yep.com/ac/?query={searchTerms}", "marginalia" => "https://search.marginalia.nu/suggest/?partial={searchTerms}", "yt" => "https://suggestqueries-clients6.youtube.com/complete/search?client=youtube&q={searchTerms}", - "sc" => "https://api-v2.soundcloud.com/search/queries?q={searchTerms}&client_id=ArYppSEotE3YiXCO4Nsgid2LLqJutiww&limit=10&offset=0&linked_partitioning=1&app_version=1693487844&app_locale=en" + "sc" => "https://api-v2.soundcloud.com/search/queries?q={searchTerms}&client_id=" . config::SC_CLIENT_TOKEN . 
"&limit=10&offset=0&linked_partitioning=1&app_version=1693487844&app_locale=en" ]; /* @@ -107,7 +108,8 @@ class autocomplete{ [ $_GET["s"], $json - ] + ], + JSON_UNESCAPED_UNICODE | JSON_UNESCAPED_SLASHES ); break; @@ -132,7 +134,8 @@ class autocomplete{ [ $_GET["s"], $json - ] + ], + JSON_UNESCAPED_UNICODE | JSON_UNESCAPED_SLASHES ); break; @@ -150,7 +153,8 @@ class autocomplete{ [ $_GET["s"], $json - ] + ], + JSON_UNESCAPED_UNICODE | JSON_UNESCAPED_SLASHES ); break; @@ -162,7 +166,8 @@ class autocomplete{ [ $_GET["s"], $json[1] // ensure it contains valid key 0 - ] + ], + JSON_UNESCAPED_UNICODE | JSON_UNESCAPED_SLASHES ); break; } @@ -170,45 +175,54 @@ class autocomplete{ private function get($url, $query){ - $curlproc = curl_init(); - - $url = str_replace("{searchTerms}", urlencode($query), $url); - - curl_setopt($curlproc, CURLOPT_URL, $url); - - curl_setopt($curlproc, CURLOPT_ENCODING, ""); // default encoding - curl_setopt($curlproc, CURLOPT_HTTPHEADER, - ["User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/116.0", - "Accept: application/json, text/javascript, */*; q=0.01", - "Accept-Language: en-US,en;q=0.5", - "Accept-Encoding: gzip", - "DNT: 1", - "Connection: keep-alive", - "Sec-Fetch-Dest: empty", - "Sec-Fetch-Mode: cors", - "Sec-Fetch-Site: same-site"] - ); - - curl_setopt($curlproc, CURLOPT_RETURNTRANSFER, true); - curl_setopt($curlproc, CURLOPT_SSL_VERIFYHOST, 2); - curl_setopt($curlproc, CURLOPT_SSL_VERIFYPEER, true); - curl_setopt($curlproc, CURLOPT_CONNECTTIMEOUT, 30); - curl_setopt($curlproc, CURLOPT_TIMEOUT, 30); - - $data = curl_exec($curlproc); - - if(curl_errno($curlproc)){ + try{ + $curlproc = curl_init(); - throw new Exception(curl_error($curlproc)); - } + $url = str_replace("{searchTerms}", urlencode($query), $url); + + curl_setopt($curlproc, CURLOPT_URL, $url); + + curl_setopt($curlproc, CURLOPT_ENCODING, ""); // default encoding + curl_setopt($curlproc, CURLOPT_HTTPHEADER, + ["User-Agent: Mozilla/5.0 
(Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/116.0", + "Accept: application/json, text/javascript, */*; q=0.01", + "Accept-Language: en-US,en;q=0.5", + "Accept-Encoding: gzip", + "DNT: 1", + "Connection: keep-alive", + "Sec-Fetch-Dest: empty", + "Sec-Fetch-Mode: cors", + "Sec-Fetch-Site: same-site"] + ); + + curl_setopt($curlproc, CURLOPT_RETURNTRANSFER, true); + curl_setopt($curlproc, CURLOPT_SSL_VERIFYHOST, 2); + curl_setopt($curlproc, CURLOPT_SSL_VERIFYPEER, true); + curl_setopt($curlproc, CURLOPT_CONNECTTIMEOUT, 30); + curl_setopt($curlproc, CURLOPT_TIMEOUT, 30); + + $data = curl_exec($curlproc); + + if(curl_errno($curlproc)){ + + throw new Exception(curl_error($curlproc)); + } + + curl_close($curlproc); + return $data; - curl_close($curlproc); - return $data; + }catch(Exception $error){ + + do404("Curl error: " . $error->getMessage()); + } } private function do404($error){ - echo json_encode(["error" => $error]); + echo json_encode( + ["error" => $error], + JSON_UNESCAPED_UNICODE | JSON_UNESCAPED_SLASHES + ); die(); } @@ -218,7 +232,8 @@ class autocomplete{ [ $_GET["s"], [] - ] + ], + JSON_UNESCAPED_UNICODE | JSON_UNESCAPED_SLASHES ); die(); } diff --git a/api/v1/images.php b/api/v1/images.php index 34510b4..3072b05 100644 --- a/api/v1/images.php +++ b/api/v1/images.php @@ -1,8 +1,14 @@ "The server administrator disabled the API!"]); + return; +} include "lib/frontend.php"; $frontend = new frontend(); diff --git a/api/v1/music.php b/api/v1/music.php index 3c30953..409e6f0 100644 --- a/api/v1/music.php +++ b/api/v1/music.php @@ -1,8 +1,14 @@ "The server administrator disabled the API!"]); + return; +} include "lib/frontend.php"; $frontend = new frontend(); diff --git a/api/v1/news.php b/api/v1/news.php index bd8678f..ddfd72a 100644 --- a/api/v1/news.php +++ b/api/v1/news.php @@ -1,8 +1,14 @@ "The server administrator disabled the API!"]); + return; +} include "lib/frontend.php"; $frontend = new frontend(); diff --git a/api/v1/videos.php 
b/api/v1/videos.php index a42b29b..dab29af 100644 --- a/api/v1/videos.php +++ b/api/v1/videos.php @@ -1,8 +1,14 @@ "The server administrator disabled the API!"]); + return; +} include "lib/frontend.php"; $frontend = new frontend(); diff --git a/api/v1/web.php b/api/v1/web.php index 61bf82a..dc1a7cc 100644 --- a/api/v1/web.php +++ b/api/v1/web.php @@ -1,8 +1,14 @@ "The server administrator disabled the API!"]); + return; +} include "lib/frontend.php"; $frontend = new frontend(); @@ -21,7 +27,13 @@ new captcha($null, $null, $null, "web", false); $get = $frontend->parsegetfilters($_GET, $filters); -if(!isset($_GET["extendedsearch"])){ +if( + isset($_GET["extendedsearch"]) && + $_GET["extendedsearch"] == "yes" +){ + + $get["extendedsearch"] = "yes"; +}else{ $get["extendedsearch"] = "no"; } diff --git a/audio.php b/audio.php index bb018da..fac4d7f 100644 --- a/audio.php +++ b/audio.php @@ -7,6 +7,7 @@ if(!isset($_GET["s"])){ die(); } +include "data/config.php"; include "lib/curlproxy.php"; $proxy = new proxy(); diff --git a/audio_sc.php b/audio_sc.php index 9a227e3..36a6855 100644 --- a/audio_sc.php +++ b/audio_sc.php @@ -1,5 +1,6 @@ tag on home page + const SERVER_SHORT_DESCRIPTION = "They live in our walls!"; + + // Will be shown in server list ping (null for no description) + const SERVER_LONG_DESCRIPTION = null; + + // Add your own themes in "static/themes". Set to "Dark" for default theme. + // Eg. To use "static/themes/Cream.css", specify "Cream". + const DEFAULT_THEME = "Dark"; + + // Enable the API? + const API_ENABLED = true; + + // Bot protection + // 4get.ca has been hit with 250k bot reqs every single day for months + // you probably want to enable this if your instance is public... 
+ // 0 = disabled + // 1 = ask for image captcha (requires image dataset & imagick 6.9.11-60) + // @TODO: 2 = invite only (users needs a pass) + const BOT_PROTECTION = 0; + + // if BOT_PROTECTION is set to 1, specify the available datasets here + // images should be named from 1.png to X.png, and be 100x100 in size + // Eg. data/captcha/birds/1.png up to 2263.png + const CAPTCHA_DATASET = [ + // example: + // ["birds", 2263], + // ["fumo_plushies", 1006], + // ["minecraft", 848] + ]; + + // List of domains that point to your servers. Include your tor/i2p + // addresses here! Must be a valid URL. Won't affect links placed on + // the homepage. + const ALT_ADDRESSES = [ + //"https://4get.alt-tld", + //"http://4getwebfrq5zr4sxugk6htxvawqehxtdgjrbcn2oslllcol2vepa23yd.onion" + ]; + + // Known 4get instances. MUST use the https protocol if your instance uses + // it. Is used to generate a distributed list of instances. + // To appear in the list of an instance, contact the host and if everyone added + // eachother your serber should appear everywhere. + const INSTANCES = [ + "https://4get.ca", + "https://4get.zzls.xyz", + "https://4get.silly.computer", + "https://4g.opnxng.com", + "https://4get.konakona.moe" + ]; + + // Default user agent to use for scraper requests. Sometimes ignored to get specific webpages + // Changing this might break things. + const USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/119.0"; + + // Proxy pool assignments for each scraper + // false = Use server's raw IP + // string = will load a proxy list from data/proxies + // Eg. 
"onion" will load data/proxies/onion.txt + const PROXY_DDG = false; // duckduckgo + const PROXY_BRAVE = false; + const PROXY_FB = false; // facebook + const PROXY_GOOGLE = false; + const PROXY_MARGINALIA = false; + const PROXY_MOJEEK = false; + const PROXY_SC = false; // soundcloud + const PROXY_WIBY = false; + const PROXY_YT = false; // youtube + const PROXY_YEP = false; + const PROXY_PINTEREST = false; + const PROXY_FTM = false; // findthatmeme + const PROXY_IMGUR = false; + const PROXY_YANDEX_W = false; // yandex web + const PROXY_YANDEX_I = false; // yandex images + const PROXY_YANDEX_V = false; // yandex videos + + // + // Scraper-specific parameters + // + + // SOUNDCLOUD + // Get these parameters by making a search on soundcloud with network + // tab open, then filter URLs using "search?q=". (No need to login) + const SC_USER_ID = "143860-454480-469473-289775"; + const SC_CLIENT_TOKEN = "qwfvRfz8PCoa2NldZALK7hhZFIH24Wyx"; + + // MARGINALIA + // Get an API key by contacting the Marginalia.nu maintainer. The "public" key + // works but is almost always rate-limited. + const MARGINALIA_API_KEY = "public"; +} diff --git a/data/instances.php b/data/instances.php deleted file mode 100644 index d7c26e0..0000000 --- a/data/instances.php +++ /dev/null @@ -1,62 +0,0 @@ - "lolcat's instance (master)", - "address" => [ - "uri" => "https://4get.ca/", - "displayname" => "4get.ca" - ], - "altaddresses" => [ - [ - // all these address blocks will be linked in parentheses - // e.g. 4get.ca (tor) (i2p) etc. 
- "uri" => "http://4getwebfrq5zr4sxugk6htxvawqehxtdgjrbcn2oslllcol2vepa23yd.onion", - "displayname" => "tor" - ] - ] - ], - [ - "name" => "zzls's Chilean instance", - "address" => [ - "uri" => "https://4get.zzls.xyz/", - "displayname" => "4get.zzls.xyz" - ], - "altaddresses" => [ - [ - "uri" => "http://4get.zzlsghu6mvvwyy75mvga6gaf4znbp3erk5xwfzedb4gg6qqh2j6rlvid.onion", - "displayname" => "tor" - ] - ] - ], - [ - "name" => "zzls's United States instance", - "address" => [ - "uri" => "https://4getus.zzls.xyz/", - "displayname" => "4getus.zzls.xyz" - ], - "altaddresses" => [ - [ - "uri" => "http://4getus.zzlsghu6mvvwyy75mvga6gaf4znbp3erk5xwfzedb4gg6qqh2j6rlvid.onion", - "displayname" => "tor" - ] - ] - ], - [ - "name" => "4get on a silly computer", - "address" => [ - "uri" => "https://4get.silly.computer", - "displayname" => "4get.silly.computer" - ], - "altaddresses" => [ - [ - "uri" => "https://4get.cynic.moe/", - "displayname" => "fallback domain" - ] - ] - ] -] -?> diff --git a/data/proxies/.gitignore b/data/proxies/.gitignore new file mode 100644 index 0000000..70fd2c3 --- /dev/null +++ b/data/proxies/.gitignore @@ -0,0 +1,3 @@ +* +!.gitignore +!onion.txt \ No newline at end of file diff --git a/data/proxies/onion.txt b/data/proxies/onion.txt new file mode 100644 index 0000000..c9b03f0 --- /dev/null +++ b/data/proxies/onion.txt @@ -0,0 +1,13 @@ +# Specify proxies by following this format: +# :
::: +# +# Examples: +# https:1.3.3.7:6969:abcd:efg +# socks4:1.2.3.4:8080:: +# raw_ip:::: +# +# Available types: +# raw_ip, http, https, socks4, socks5, socks4a, socks5_hostname + +# Local tor proxy +socks5:localhost:9050:: diff --git a/favicon.php b/favicon.php index dadb923..2a31839 100644 --- a/favicon.php +++ b/favicon.php @@ -6,6 +6,7 @@ if(!isset($_GET["s"])){ die(); } +include "data/config.php"; new favicon($_GET["s"]); class favicon{ diff --git a/images.php b/images.php index 5be8de4..d9dbecf 100644 --- a/images.php +++ b/images.php @@ -3,6 +3,8 @@ /* Initialize random shit */ +include "data/config.php"; + include "lib/frontend.php"; $frontend = new frontend(); @@ -26,20 +28,7 @@ try{ }catch(Exception $error){ - echo - $frontend->drawerror( - "Shit", - 'This scraper returned an error:' . - '
' . htmlspecialchars($error->getMessage()) . '
' . - 'Things you can try:' . - '
    ' . - '
  • Use a different scraper
  • ' . - '
  • Remove keywords that could cause errors
  • ' . - '
  • Use another 4get instance
  • ' . - '

' . - 'If the error persists, please contact the administrator.' - ); - die(); + $frontend->drawscrapererror($error->getMessage(), $get, "images"); } if(count($results["image"]) === 0){ diff --git a/index.php b/index.php index be9897f..8eba2fc 100644 --- a/index.php +++ b/index.php @@ -1,5 +1,6 @@ load( "home.html", [ - "body_class" => $frontend->getthemeclass(false), + "server_short_description" => htmlspecialchars(config::SERVER_SHORT_DESCRIPTION), "banner" => $images[rand(0, count($images) - 1)] ] ); diff --git a/instances.php b/instances.php new file mode 100644 index 0000000..b9db771 --- /dev/null +++ b/instances.php @@ -0,0 +1,55 @@ + $value){ + + if( + !is_string($value) || + $key == "target" + ){ + + continue; + } + + if($first === true){ + + $first = false; + $params = "?"; + }else{ + + $params .= "&"; + } + + $params .= urlencode($key) . "=" . urlencode($value); +} + +if( + !isset($_GET["target"]) || + !is_string($_GET["target"]) +){ + + $target = ""; +}else{ + + $target = "/" . urlencode($_GET["target"]); +} + +$instances = ""; +foreach(config::INSTANCES as $instance){ + + $instances .= '' . htmlspecialchars($instance) . ''; +} + +echo + $frontend->load( + "instances.html", + [ + "instances_html" => $instances + ] + ); diff --git a/lib/backend.php b/lib/backend.php new file mode 100644 index 0000000..209cfec --- /dev/null +++ b/lib/backend.php @@ -0,0 +1,197 @@ +scraper = $scraper; + $this->requestid = apcu_inc("real_requests"); + } + + /* + Proxy stuff + */ + public function get_ip(){ + + $pool = constant("config::PROXY_" . strtoupper($this->scraper)); + if($pool === false){ + + // we don't want a proxy, fuck off! + return 'raw_ip::::'; + } + + // indent + $proxy_index_raw = apcu_inc("p." . $this->scraper); + + $proxylist = file_get_contents("data/proxies/" . $pool . 
".txt"); + $proxylist = explode("\n", $proxylist); + + // ignore empty or commented lines + $proxylist = array_filter($proxylist, function($entry){ + $entry = ltrim($entry); + return strlen($entry) > 0 && substr($entry, 0, 1) != "#"; + }); + + $proxylist = array_values($proxylist); + + return $proxylist[$proxy_index_raw % count($proxylist)]; + } + + // this function is also called directly on nextpage + public function assign_proxy(&$curlproc, $ip){ + + // parse proxy line + [ + $type, + $address, + $port, + $username, + $password + ] = explode(":", $ip, 5); + + switch($type){ + + case "raw_ip": + return; + break; + + case "http": + case "https": + curl_setopt($curlproc, CURLOPT_PROXYTYPE, CURLPROXY_HTTP); + curl_setopt($curlproc, CURLOPT_PROXY, $type . "://" . $address . ":" . $port); + break; + + case "socks4": + curl_setopt($curlproc, CURLOPT_PROXYTYPE, CURLPROXY_SOCKS4); + curl_setopt($curlproc, CURLOPT_PROXY, $address . ":" . $port); + break; + + case "socks5": + curl_setopt($curlproc, CURLOPT_PROXYTYPE, CURLPROXY_SOCKS5); + curl_setopt($curlproc, CURLOPT_PROXY, $address . ":" . $port); + break; + + case "socks4a": + curl_setopt($curlproc, CURLOPT_PROXYTYPE, CURLPROXY_SOCKS4A); + curl_setopt($curlproc, CURLOPT_PROXY, $address . ":" . $port); + break; + + case "socks5_hostname": + curl_setopt($curlproc, CURLOPT_PROXYTYPE, CURLPROXY_SOCKS5_HOSTNAME); + curl_setopt($curlproc, CURLOPT_PROXY, $address . ":" . $port); + break; + } + + if($username != ""){ + + curl_setopt($curlproc, CURLOPT_PROXYUSERPWD, $username . ":" . 
$password); + } + } + + + + /* + Next page stuff + */ + public function store($payload, $page, $proxy){ + + $page = $page[0]; + $password = random_bytes(256); // 2048 bit + $salt = random_bytes(16); + $key = hash_pbkdf2("sha512", $password, $salt, 20000, 32, true); + $iv = + random_bytes( + openssl_cipher_iv_length("aes-256-gcm") + ); + + $tag = ""; + $out = openssl_encrypt($payload, "aes-256-gcm", $key, OPENSSL_RAW_DATA, $iv, $tag, "", 16); + + $key = apcu_inc("key", 1); + + apcu_store( + $page . "." . + $this->scraper . + $this->requestid, + gzdeflate($proxy . "," . $salt.$iv.$out.$tag), + 900 // cache information for 15 minutes blaze it + ); + + return + $this->scraper . $this->requestid . "." . + rtrim(strtr(base64_encode($password), '+/', '-_'), '='); + } + + public function get($npt, $page){ + + $page = $page[0]; + $explode = explode(".", $npt, 2); + + if(count($explode) !== 2){ + + throw new Exception("Malformed nextPageToken!"); + } + + $apcu = $page . "." . $explode[0]; + $key = $explode[1]; + + $payload = apcu_fetch($apcu); + + if($payload === false){ + + throw new Exception("The nextPageToken is invalid or has expired!"); + } + + $key = + base64_decode( + str_pad( + strtr($key, '-_', '+/'), + strlen($key) % 4, + '=', + STR_PAD_RIGHT + ) + ); + + $payload = gzinflate($payload); + + // get proxy + [ + $proxy, + $payload + ] = explode(",", $payload, 2); + + $key = + hash_pbkdf2( + "sha512", + $key, + substr($payload, 0, 16), // salt + 20000, + 32, + true + ); + $ivlen = openssl_cipher_iv_length("aes-256-gcm"); + + $payload = + openssl_decrypt( + substr( + $payload, + 16 + $ivlen, + -16 + ), + "aes-256-gcm", + $key, + OPENSSL_RAW_DATA, + substr($payload, 16, $ivlen), + substr($payload, -16) + ); + + if($payload === false){ + + throw new Exception("The nextPageToken is invalid or has expired!"); + } + + // remove the key after using + apcu_delete($apcu); + + return [$payload, $proxy]; + } +} diff --git a/lib/captcha_gen.php b/lib/captcha_gen.php index 
80bc665..6728747 100644 --- a/lib/captcha_gen.php +++ b/lib/captcha_gen.php @@ -4,6 +4,19 @@ class captcha{ public function __construct($frontend, $get, $filters, $page, $output){ + // check if we want captcha + if(config::BOT_PROTECTION !== 1){ + + if($output === true){ + $frontend->loadheader( + $get, + $filters, + $page + ); + } + return; + } + /* Validate cookie, if it exists */ @@ -46,6 +59,7 @@ class captcha{ if($output === false){ + http_response_code(429); // too many reqs echo json_encode([ "status" => "The \"pass\" token in your cookies is missing or has expired!!" ]); @@ -184,15 +198,6 @@ class captcha{ } } - /* - Generate random grid data to pass to captcha.php - */ - $dataset = [ - ["birds", 2263], - ["fumo_plushies", 1006], - ["minecraft", 848] - ]; - // get the positions for the answers // will return between 3 and 6 answer positions $range = range(0, 15); @@ -216,17 +221,18 @@ class captcha{ } // choose a dataset - $choosen = &$dataset[random_int(0, count($dataset) - 1)]; + $c = count(config::CAPTCHA_DATASET); + $choosen = config::CAPTCHA_DATASET[random_int(0, $c - 1)]; $choices = []; - for($i=0; $i'; + }else{ + + $replacements["style"] = ""; + } + + if(isset($_COOKIE["scraper_ac"])){ + + $replacements["ac"] = '?ac=' . htmlspecialchars($_COOKIE["scraper_ac"]); + }else{ + + $replacements["ac"] = ''; + } + $handle = fopen("template/{$template}", "r"); $data = fread($handle, filesize("template/{$template}")); fclose($handle); @@ -29,30 +64,6 @@ class frontend{ return trim($html); } - public function getthemeclass($raw = true){ - - if( - isset($_COOKIE["theme"]) && - $_COOKIE["theme"] == "cream" - ){ - - $body_class = "theme-white "; - }else{ - - $body_class = ""; - } - - if( - $raw && - $body_class != "" - ){ - - return ' class="' . rtrim($body_class) . 
'"'; - } - - return $body_class; - } - public function loadheader(array $get, array $filters, string $page){ echo @@ -62,8 +73,7 @@ class frontend{ "index" => "no", "search" => htmlspecialchars($get["s"]), "tabs" => $this->generatehtmltabs($page, $get["s"]), - "filters" => $this->generatehtmlfilters($filters, $get), - "body_class" => $this->getthemeclass() + "filters" => $this->generatehtmlfilters($filters, $get) ]); if( @@ -74,18 +84,17 @@ class frontend{ ){ // bot detected !! - echo - $this->drawerror( - "Tshh, blocked!", - 'You were blocked from viewing this page. If you wish to scrape data from 4get, please consider running your own 4get instance or using the API.', - ); + $this->drawerror( + "Tshh, blocked!", + 'You were blocked from viewing this page. If you wish to scrape data from 4get, please consider running your own 4get instance or using the API.', + ); die(); } } public function drawerror($title, $error){ - return + echo $this->load("search.html", [ "class" => "", "right-left" => "", @@ -96,6 +105,23 @@ class frontend{ $error . '' ]); + die(); + } + + public function drawscrapererror($error, $get, $target){ + + $this->drawerror( + "Shit", + 'This scraper returned an error:' . + '
' . htmlspecialchars($error) . '
' . + 'Things you can try:' . + '
' . + 'If the error persists, please contact the administrator.' + ); } public function drawtextresult($site, $greentext = null, $duration = null, $keywords, $tabindex = true, $customhtml = null){ @@ -819,30 +845,7 @@ class frontend{ public function getscraperfilters($page){ - $get_scraper = null; - - switch($page){ - - case "web": - $get_scraper = isset($_COOKIE["scraper_web"]) ? $_COOKIE["scraper_web"] : null; - break; - - case "images": - $get_scraper = isset($_COOKIE["scraper_images"]) ? $_COOKIE["scraper_images"] : null; - break; - - case "videos": - $get_scraper = isset($_COOKIE["scraper_videos"]) ? $_COOKIE["scraper_videos"] : null; - break; - - case "news": - $get_scraper = isset($_COOKIE["scraper_news"]) ? $_COOKIE["scraper_news"] : null; - break; - - case "music": - $get_scraper = isset($_COOKIE["scraper_news"]) ? $_COOKIE["scraper_news"] : null; - break; - } + $get_scraper = isset($_COOKIE["scraper_$page"]) ? $_COOKIE["scraper_$page"] : null; if( isset($_GET["scraper"]) && @@ -1148,32 +1151,8 @@ class frontend{ break; case "_SEARCH": - - // get search string & bang - $sanitized[$parameter] = trim($sanitized[$parameter]); - $sanitized["bang"] = ""; - - if( - strlen($sanitized[$parameter]) !== 0 && - $sanitized[$parameter][0] == "!" - ){ - - $sanitized[$parameter] = explode(" ", $sanitized[$parameter], 2); - - $sanitized["bang"] = trim($sanitized[$parameter][0]); - - if(count($sanitized[$parameter]) === 2){ - - $sanitized[$parameter] = trim($sanitized[$parameter][1]); - }else{ - - $sanitized[$parameter] = ""; - } - - $sanitized["bang"] = ltrim($sanitized["bang"], "!"); - } - - $sanitized[$parameter] = ltrim($sanitized[$parameter], "! 
\n\r\t\v\x00"); + // get search string + $sanitized["s"] = trim($sanitized[$parameter]); } } } diff --git a/lib/fuckhtml.php b/lib/fuckhtml.php index 5c65417..cb5d38d 100644 --- a/lib/fuckhtml.php +++ b/lib/fuckhtml.php @@ -442,5 +442,3 @@ class fuckhtml{ return json_decode($json_out, true); } } - -?> diff --git a/lib/nextpage.php b/lib/nextpage.php deleted file mode 100644 index 7516667..0000000 --- a/lib/nextpage.php +++ /dev/null @@ -1,106 +0,0 @@ -scraper = $scraper; - } - - public function store($payload, $page){ - - $page = $page[0]; - $password = random_bytes(256); // 2048 bit - $salt = random_bytes(16); - $key = hash_pbkdf2("sha512", $password, $salt, 20000, 32, true); - $iv = - random_bytes( - openssl_cipher_iv_length("aes-256-gcm") - ); - - $tag = ""; - $out = openssl_encrypt($payload, "aes-256-gcm", $key, OPENSSL_RAW_DATA, $iv, $tag, "", 16); - - $key = apcu_inc("key", 1); - - apcu_store( - $page . "." . - $this->scraper . - (string)$key, - gzdeflate($salt.$iv.$out.$tag), - 900 // cache information for 15 minutes blaze it - ); - - return - $this->scraper . $key . "." . - rtrim(strtr(base64_encode($password), '+/', '-_'), '='); - } - - public function get($npt, $page){ - - $page = $page[0]; - $explode = explode(".", $npt, 2); - - if(count($explode) !== 2){ - - throw new Exception("Malformed nextPageToken!"); - } - - $apcu = $page . "." . 
$explode[0]; - $key = $explode[1]; - - $payload = apcu_fetch($apcu); - - if($payload === false){ - - throw new Exception("The nextPageToken is invalid or has expired!"); - } - - $key = - base64_decode( - str_pad( - strtr($key, '-_', '+/'), - strlen($key) % 4, - '=', - STR_PAD_RIGHT - ) - ); - - $payload = gzinflate($payload); - - $key = - hash_pbkdf2( - "sha512", - $key, - substr($payload, 0, 16), // salt - 20000, - 32, - true - ); - $ivlen = openssl_cipher_iv_length("aes-256-gcm"); - - $payload = - openssl_decrypt( - substr( - $payload, - 16 + $ivlen, - -16 - ), - "aes-256-gcm", - $key, - OPENSSL_RAW_DATA, - substr($payload, 16, $ivlen), - substr($payload, -16) - ); - - if($payload === false){ - - throw new Exception("The nextPageToken is invalid or has expired!"); - } - - // remove the key after using - apcu_delete($apcu); - - return $payload; - } -} diff --git a/music.php b/music.php index c95fb4c..5bc3e5f 100644 --- a/music.php +++ b/music.php @@ -3,6 +3,8 @@ /* Initialize random shit */ +include "data/config.php"; + include "lib/frontend.php"; $frontend = new frontend(); @@ -28,20 +30,7 @@ try{ }catch(Exception $error){ - echo - $frontend->drawerror( - "Shit", - 'This scraper returned an error:' . - '
' . htmlspecialchars($error->getMessage()) . '
' . - 'Things you can try:' . - '
    ' . - '
  • Use a different scraper
  • ' . - '
  • Remove keywords that could cause errors
  • ' . - '
  • Use another 4get instance
  • ' . - '

' . - 'If the error persists, please contact the administrator.' - ); - die(); + $frontend->drawscrapererror($error->getMessage(), $get, "music"); } $categories = [ diff --git a/news.php b/news.php index ff37489..9a237a4 100644 --- a/news.php +++ b/news.php @@ -3,6 +3,8 @@ /* Initialize random shit */ +include "data/config.php"; + include "lib/frontend.php"; $frontend = new frontend(); @@ -28,20 +30,7 @@ try{ }catch(Exception $error){ - echo - $frontend->drawerror( - "Shit", - 'This scraper returned an error:' . - '
' . htmlspecialchars($error->getMessage()) . '
' . - 'Things you can try:' . - '
    ' . - '
  • Use a different scraper
  • ' . - '
  • Remove keywords that could cause errors
  • ' . - '
  • Use another 4get instance
  • ' . - '

' . - 'If the error persists, please contact the administrator.' - ); - die(); + $frontend->drawscrapererror($error->getMessage(), $get, "news"); } /* diff --git a/opensearch.php b/opensearch.php new file mode 100644 index 0000000..632a533 --- /dev/null +++ b/opensearch.php @@ -0,0 +1,29 @@ +' . + '' . + '' . htmlspecialchars(config::SERVER_NAME) . '' . + 'UTF-8' . + '' . $domain . '/favicon.ico' . + ''; + +if( + isset($_GET["ac"]) && + is_string($_GET["ac"]) && + $_GET["ac"] != "disabled" +){ + + echo ''; +} + +echo ''; diff --git a/proxy.php b/proxy.php index b49fafd..563f378 100644 --- a/proxy.php +++ b/proxy.php @@ -1,5 +1,6 @@ fuckhtml = new fuckhtml(); - include "lib/nextpage.php"; - $this->nextpage = new nextpage("brave"); + include "lib/backend.php"; + $this->backend = new backend("brave"); } public function getfilters($page){ @@ -138,13 +138,20 @@ class brave{ "maybe" => "Maybe", "no" => "No" ] + ], + "spellcheck" => [ + "display" => "Spellcheck", + "option" => [ + "yes" => "Yes", + "no" => "No" + ] ] ]; break; } } - private function get($url, $get = [], $nsfw, $country){ + private function get($proxy, $url, $get = [], $nsfw, $country){ switch($nsfw){ @@ -159,7 +166,7 @@ class brave{ } $headers = [ - "User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:107.0) Gecko/20100101 Firefox/110.0", + "User-Agent: " . 
config::USER_AGENT, "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8", "Accept-Language: en-US,en;q=0.5", "Accept-Encoding: gzip", @@ -190,11 +197,12 @@ class brave{ curl_setopt($curlproc, CURLOPT_SSL_VERIFYPEER, true); curl_setopt($curlproc, CURLOPT_CONNECTTIMEOUT, 30); curl_setopt($curlproc, CURLOPT_TIMEOUT, 30); + + $this->backend->assign_proxy($curlproc, $proxy); $data = curl_exec($curlproc); if(curl_errno($curlproc)){ - throw new Exception(curl_error($curlproc)); } @@ -207,7 +215,9 @@ class brave{ if($get["npt"]){ // get next page data - $q = json_decode($this->nextpage->get($get["npt"], "web"), true); + [$q, $proxy] = $this->backend->get($get["npt"], "web"); + + $q = json_decode($q, true); $search = $q["q"]; $q["spellcheck"] = "0"; @@ -222,7 +232,6 @@ class brave{ // get _GET data instead $search = $get["s"]; - if(strlen($search) === 0){ throw new Exception("Search term is empty!"); @@ -230,9 +239,10 @@ class brave{ if(strlen($search) > 2048){ - throw new Exception("Search query is too long!"); + throw new Exception("Search term is too long!"); } + $proxy = $this->backend->get_ip(); $nsfw = $get["nsfw"]; $country = $get["country"]; $older = $get["older"]; @@ -288,6 +298,7 @@ class brave{ try{ $html = $this->get( + $proxy, "https://search.brave.com/search", $q, $nsfw, @@ -361,9 +372,10 @@ class brave{ $q["country"] = $country; $out["npt"] = - $this->nextpage->store( + $this->backend->store( json_encode($q), - "web" + "web", + $proxy ); } } @@ -759,7 +771,9 @@ class brave{ "description" => isset($result["review"]["description"]) ? 
$this->limitstrlen( - $result["review"]["description"] + strip_tags( + $result["review"]["description"] + ) ) : $this->titledots( $this->fuckhtml @@ -839,6 +853,32 @@ class brave{ "value" => $this->titledots($info["long_desc"]) ]; } + + // parse ratings + if( + isset($info["ratings"]) && + $info["ratings"] != "void 0" + ){ + + $description[] = [ + "type" => "title", + "value" => "Ratings" + ]; + + foreach($info["ratings"] as $rating){ + + $description[] = [ + "type" => "link", + "url" => $rating["profile"]["url"], + "value" => $rating["profile"]["name"] + ]; + + $description[] = [ + "type" => "text", + "value" => ": " . $rating["ratingValue"] . "/" . $rating["bestRating"] . "\n" + ]; + } + } } $table = []; @@ -908,9 +948,9 @@ class brave{ $out["video"][] = [ "title" => $this->titledots($video["title"]), "description" => $this->titledots($video["description"]), - "date" => isset($video["age"]) ? strtotime($video["age"]) : null, - "duration" => isset($video["video"]["duration"]) ? $this->hms2int($video["video"]["duration"]) : null, - "views" => null, + "date" => isset($video["age"]) && $video["age"] != "void 0" ? strtotime($video["age"]) : null, + "duration" => isset($video["video"]["duration"]) && $video["video"]["duration"] != "void 0" ? $this->hms2int($video["video"]["duration"]) : null, + "views" => isset($video["video"]["views"]) && $video["video"]["views"] != "void 0" ? (int)$video["video"]["views"] : null, "thumb" => isset($video["thumbnail"]["src"]) ? 
[ @@ -1008,37 +1048,75 @@ class brave{ public function news($get){ - $search = $get["s"]; - if(strlen($search) === 0){ + if($get["npt"]){ - throw new Exception("Search term is empty!"); - } - - $nsfw = $get["nsfw"]; - $country = $get["country"]; - - if(strlen($search) > 2048){ + [$req, $proxy] = $this->backend->get($get["npt"], "news"); - throw new Exception("Search query is too long!"); - } - /* - $handle = fopen("scraper/brave-news.html", "r"); - $html = fread($handle, filesize("scraper/brave-news.html")); - fclose($handle);*/ - try{ - $html = - $this->get( - "https://search.brave.com/news", - [ - "q" => $search - ], - $nsfw, - $country - ); + $req = json_decode($req, true); - }catch(Exception $error){ + $search = $req["q"]; + $country = $req["country"]; + $nsfw = $req["nsfw"]; + $offset = $req["offset"]; + $spellcheck = $req["spellcheck"]; - throw new Exception("Could not fetch search page"); + try{ + $html = + $this->get( + $proxy, + "https://search.brave.com/news", + [ + "q" => $search, + "offset" => $offset, + "spellcheck" => $spellcheck + ], + $nsfw, + $country + ); + + }catch(Exception $error){ + + throw new Exception("Could not fetch search page"); + } + + }else{ + $search = $get["s"]; + if(strlen($search) === 0){ + + throw new Exception("Search term is empty!"); + } + + if(strlen($search) > 2048){ + + throw new Exception("Search term is too long!"); + } + + $proxy = $this->backend->get_ip(); + $nsfw = $get["nsfw"]; + $country = $get["country"]; + $spellcheck = $get["spellcheck"] == "yes" ? 
"1" : "0"; + + /* + $handle = fopen("scraper/brave-news.html", "r"); + $html = fread($handle, filesize("scraper/brave-news.html")); + fclose($handle);*/ + try{ + $html = + $this->get( + $proxy, + "https://search.brave.com/news", + [ + "q" => $search, + "spellcheck" => $spellcheck + ], + $nsfw, + $country + ); + + }catch(Exception $error){ + + throw new Exception("Could not fetch search page"); + } } $out = [ @@ -1050,6 +1128,17 @@ class brave{ // load html $this->fuckhtml->load($html); + // get npt + $out["npt"] = + $this->generatenextpagetoken( + $search, + $nsfw, + $country, + $spellcheck, + "news", + $proxy + ); + $news = $this->fuckhtml ->getElementsByClassName( @@ -1183,8 +1272,19 @@ class brave{ public function image($get){ $search = $get["s"]; + if(strlen($search) === 0){ + + throw new Exception("Search term is empty!"); + } + + if(strlen($search) > 2048){ + + throw new Exception("Search term is too long!"); + } + $country = $get["country"]; $nsfw = $get["nsfw"]; + $spellcheck = $get["spellcheck"] == "yes" ? 
"1" : "0"; $out = [ "status" => "ok", @@ -1195,9 +1295,11 @@ class brave{ try{ $html = $this->get( + $this->backend->get_ip(), // no nextpage right now, pass proxy directly "https://search.brave.com/images", [ - "q" => $search + "q" => $search, + "spellcheck" => $spellcheck ], $nsfw, $country @@ -1261,9 +1363,75 @@ class brave{ public function video($get){ - $search = $get["s"]; - $country = $get["country"]; - $nsfw = $get["nsfw"]; + if($get["npt"]){ + + [$npt, $proxy] = $this->backend->get($get["npt"], "videos"); + + $npt = json_decode($npt, true); + $search = $npt["q"]; + $offset = $npt["offset"]; + $spellcheck = $npt["spellcheck"]; + $country = $npt["country"]; + $nsfw = $npt["nsfw"]; + + try{ + $html = + $this->get( + $proxy, + "https://search.brave.com/videos", + [ + "q" => $search, + "offset" => $offset, + "spellcheck" => $spellcheck + ], + $nsfw, + $country + ); + + }catch(Exception $error){ + + throw new Exception("Could not fetch search page"); + } + + }else{ + + $search = $get["s"]; + if(strlen($search) === 0){ + + throw new Exception("Search term is empty!"); + } + + if(strlen($search) > 2048){ + + throw new Exception("Search term is too long!"); + } + + $country = $get["country"]; + $nsfw = $get["nsfw"]; + $spellcheck = $get["spellcheck"] == "yes" ? 
"1" : "0"; + + $proxy = $this->backend->get_ip(); + + try{ + $html = + $this->get( + $proxy, + "https://search.brave.com/videos", + [ + "q" => $search, + "spellcheck" => $spellcheck + ], + $nsfw, + $country + ); + + }catch(Exception $error){ + + throw new Exception("Could not fetch search page"); + } + } + + $this->fuckhtml->load($html); $out = [ "status" => "ok", @@ -1275,21 +1443,17 @@ class brave{ "reel" => [] ]; - try{ - $html = - $this->get( - "https://search.brave.com/videos", - [ - "q" => $search - ], - $nsfw, - $country - ); - - }catch(Exception $error){ - - throw new Exception("Could not fetch search page"); - } + // get npt + $out["npt"] = + $this->generatenextpagetoken( + $search, + $nsfw, + $country, + $spellcheck, + "videos", + $proxy + ); + /* $handle = fopen("scraper/brave-video.html", "r"); $html = fread($handle, filesize("scraper/brave-video.html")); @@ -1606,7 +1770,7 @@ class brave{ $data["table"][trim($html[0])] = trim($html[1]); } } - + /* private function getimagelinkfromstyle($thumb){ $thumb = @@ -1646,13 +1810,13 @@ class brave{ "url" => $url, "ratio" => "16:9" ]; - } + }*/ private function limitstrlen($text){ return explode("\n", wordwrap($text, 300, "\n"))[0]; } - + /* private function limitwhitespace($text){ return @@ -1661,7 +1825,7 @@ class brave{ " ", $text ); - } + }*/ private function titledots($title){ @@ -1678,6 +1842,52 @@ class brave{ return trim($title); } + private function generatenextpagetoken($q, $nsfw, $country, $spellcheck, $page, $proxy){ + + $nextpage = + $this->fuckhtml + ->getElementsByClassName("btn", "a"); + + if(count($nextpage) !== 0){ + + $nextpage = + $nextpage[count($nextpage) - 1]; + + if( + strtolower( + $this->fuckhtml + ->getTextContent( + $nextpage + ) + ) == "next" + ){ + + preg_match( + '/offset=([0-9]+)/', + $this->fuckhtml->getTextContent($nextpage["attributes"]["href"]), + $nextpage + ); + + return + $this->backend->store( + json_encode( + [ + "q" => $q, + "offset" => (int)$nextpage[1], + "nsfw" => 
$nsfw, + "country" => $country, + "spellcheck" => $spellcheck + ] + ), + $page, + $proxy + ); + } + } + + return null; + } + private function unshiturl($url){ // https://imgs.search.brave.com/XFnbR8Sl7ge82MBDEH7ju0UHImRovMVmQ2qnDvgNTuA/rs:fit:844:225:1/g:ce/aHR0cHM6Ly90c2U0/Lm1tLmJpbmcubmV0/L3RoP2lkPU9JUC54/UWotQXU5N2ozVndT/RDJnNG9BNVhnSGFF/SyZwaWQ9QXBp.jpeg diff --git a/scraper/ddg.php b/scraper/ddg.php index 1ce8e18..2d737ba 100644 --- a/scraper/ddg.php +++ b/scraper/ddg.php @@ -4,8 +4,11 @@ class ddg{ public function __construct(){ - include "lib/nextpage.php"; - $this->nextpage = new nextpage("ddg"); + include "lib/backend.php"; + $this->backend = new backend("ddg"); + + include "lib/fuckhtml.php"; + $this->fuckhtml = new fuckhtml(); } /* @@ -14,7 +17,7 @@ class ddg{ private const req_web = 0; private const req_xhr = 1; - private function get($url, $get = [], $reqtype = self::req_web){ + private function get($proxy, $url, $get = [], $reqtype = self::req_web){ $curlproc = curl_init(); @@ -28,7 +31,7 @@ class ddg{ switch($reqtype){ case self::req_web: $headers = - ["User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/113.0", + ["User-Agent: " . config::USER_AGENT, "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8", "Accept-Encoding: gzip", "Accept-Language: en-US,en;q=0.5", @@ -43,7 +46,7 @@ class ddg{ case self::req_xhr: $headers = - ["User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/113.0", + ["User-Agent: " . 
config::USER_AGENT, "Accept: */*", "Accept-Encoding: gzip", "Accept-Language: en-US,en;q=0.5", @@ -57,6 +60,8 @@ class ddg{ break; } + $this->backend->assign_proxy($curlproc, $proxy); + curl_setopt($curlproc, CURLOPT_ENCODING, ""); // default encoding curl_setopt($curlproc, CURLOPT_HTTPHEADER, $headers); @@ -69,7 +74,6 @@ class ddg{ $data = curl_exec($curlproc); if(curl_errno($curlproc)){ - throw new Exception(curl_error($curlproc)); } @@ -541,9 +545,11 @@ class ddg{ public function web($get){ + $proxy = null; + if($get["npt"]){ - $jsgrep = $this->nextpage->get($get["npt"], "web"); + [$jsgrep, $proxy] = $this->backend->get($get["npt"], "web"); $extendedsearch = false; $inithtml = ""; @@ -555,6 +561,7 @@ class ddg{ throw new Exception("Search term is empty!"); } + $proxy = $this->backend->get_ip(); $country = $get["country"]; $nsfw = $get["nsfw"]; $older = $get["older"]; @@ -614,9 +621,9 @@ class ddg{ /* Get html */ - // https://duckduckgo.com/?q=minecraft&kz=1&k1=-1&kp=-2 try{ $inithtml = $this->get( + $proxy, "https://duckduckgo.com/", $get_filters ); @@ -643,6 +650,7 @@ class ddg{ try{ $js = $this->get( + $proxy, "https://links.duckduckgo.com" . $jsgrep, [], ddg::req_xhr @@ -692,6 +700,7 @@ class ddg{ // get definition $wordnikjs = $this->get( + $proxy, "https://duckduckgo.com/js/spice/dictionary/definition/" . $wordnik, [], ddg::req_xhr @@ -725,6 +734,7 @@ class ddg{ $wordnikaudio_json = json_decode( $this->get( + $proxy, "https://duckduckgo.com/js/spice/dictionary/audio/" . $wordnik, [], ddg::req_xhr @@ -922,6 +932,7 @@ class ddg{ try{ $stackjs = $this->get( + $proxy, "https://duckduckgo.com" . 
$stack, [], ddg::req_xhr @@ -944,7 +955,7 @@ class ddg{ $out["answer"][] = [ "title" => $stackjson["Heading"], - "description" => $this->htmltoarray($stackjson["Abstract"]), + "description" => $this->stackoverflow_parse($stackjson["Abstract"]), "url" => str_replace(["http://", "ddg"], ["https://", ""], $stackjson["AbstractURL"]), "thumb" => null, "table" => [], @@ -973,6 +984,7 @@ class ddg{ try{ $lyricsjs = $this->get( + $proxy, "https://duckduckgo.com" . $lyrics, [], ddg::req_xhr @@ -1166,13 +1178,13 @@ class ddg{ if(isset($answers[$i]["data"]["AbstractText"]) && !empty($answers[$i]["data"]["AbstractText"])){ - $description = $this->htmltoarray($answers[$i]["data"]["AbstractText"]); + $description = $this->stackoverflow_parse($answers[$i]["data"]["AbstractText"]); }elseif(isset($answers[$i]["data"]["Abstract"]) && !empty($answers[$i]["data"]["Abstract"])){ - $description = $this->htmltoarray($answers[$i]["data"]["Abstract"]); + $description = $this->stackoverflow_parse($answers[$i]["data"]["Abstract"]); }elseif(isset($answers[$i]["data"]["Answer"]) && !empty($answers[$i]["data"]["Answer"])){ - $description = $this->htmltoarray($answers[$i]["data"]["Answer"]); + $description = $this->stackoverflow_parse($answers[$i]["data"]["Answer"]); }else{ $description = []; @@ -1310,6 +1322,7 @@ class ddg{ $description = []; $shitcoinjs = $this->get( + $proxy, "https://duckduckgo.com/js/spice/cryptocurrency/{$shitcoins[1]}/{$shitcoins[2]}/1", [], ddg::req_xhr @@ -1408,6 +1421,7 @@ class ddg{ try{ $currencyjs = $this->get( + $proxy, "https://duckduckgo.com/js/spice/currency/{$amount}/" . strtolower($currencies[1]) . "/" . strtolower($currencies[2]), [], ddg::req_xhr @@ -1607,7 +1621,7 @@ class ddg{ // store next page token if(isset($web[$i]["n"])){ - $out["npt"] = $this->nextpage->store($web[$i]["n"] . "&biaexp=b&eslexp=a&litexp=c&msvrtexp=b&wrap=1", "web"); + $out["npt"] = $this->backend->store($web[$i]["n"] . 
"&biaexp=b&eslexp=a&litexp=c&msvrtexp=b&wrap=1", "web", $proxy); continue; } @@ -1874,10 +1888,11 @@ class ddg{ if($get["npt"]){ - $npt = $this->nextpage->get($get["npt"], "images"); + [$npt, $proxy] = $this->backend->get($get["npt"], "images"); try{ $json = json_decode($this->get( + $proxy, "https://duckduckgo.com/i.js?" . $npt, [], ddg::req_xhr @@ -1895,6 +1910,7 @@ class ddg{ throw new Exception("Search term is empty!"); } + $proxy = $this->backend->get_ip(); $country = $get["country"]; $nsfw = $get["nsfw"]; $date = $get["date"]; @@ -1934,6 +1950,7 @@ class ddg{ try{ $html = $this->get( + $proxy, "https://duckduckgo.com", $get_filters, ddg::req_web @@ -1980,6 +1997,7 @@ class ddg{ try{ $json = json_decode($this->get( + $proxy, "https://duckduckgo.com/i.js", $js_params, ddg::req_xhr @@ -2005,10 +2023,11 @@ class ddg{ } $out["npt"] = - $this->nextpage->store( + $this->backend->store( explode("?", $json["next"])[1] . "&vqd=" . $vqd, - "images" + "images", + $proxy ); } @@ -2046,10 +2065,11 @@ class ddg{ if($get["npt"]){ - $npt = $this->nextpage->get($get["npt"], "videos"); + [$npt, $proxy] = $this->backend->get($get["npt"], "videos"); try{ $json = json_decode($this->get( + $proxy, "https://duckduckgo.com/v.js?" . 
$npt, [], @@ -2068,6 +2088,7 @@ class ddg{ throw new Exception("Search term is empty!"); } + $proxy = $this->backend->get_ip(); $country = $get["country"]; $nsfw = $get["nsfw"]; $date = $get["date"]; @@ -2099,6 +2120,7 @@ class ddg{ try{ $html = $this->get( + $proxy, "https://duckduckgo.com", $get_filters, ddg::req_web @@ -2123,6 +2145,7 @@ class ddg{ try{ $json = json_decode($this->get( + $proxy, "https://duckduckgo.com/v.js", [ "l" => "us-en", @@ -2155,9 +2178,10 @@ class ddg{ if(isset($json["next"])){ $out["npt"] = - $this->nextpage->store( + $this->backend->store( explode("?", $json["next"])[1], - "videos" + "videos", + $proxy ); } @@ -2213,11 +2237,12 @@ class ddg{ if($get["npt"]){ - $req = $this->nextpage->get($get["npt"], "news"); + [$req, $proxy] = $this->backend->get($get["npt"], "news"); try{ $json = json_decode($this->get( + $proxy, "https://duckduckgo.com/news.js?" . $req, [], @@ -2236,6 +2261,7 @@ class ddg{ throw new Exception("Search term is empty!"); } + $proxy = $this->backend->get_ip(); $country = $get["country"]; $nsfw = $get["nsfw"]; $date = $get["date"]; @@ -2261,6 +2287,7 @@ class ddg{ try{ $html = $this->get( + $proxy, "https://duckduckgo.com", $get_params, ddg::req_web @@ -2303,6 +2330,7 @@ class ddg{ } $json = json_decode($this->get( + $proxy, "https://duckduckgo.com/news.js", $js_params, ddg::req_xhr @@ -2323,9 +2351,10 @@ class ddg{ if(isset($json["next"])){ $out["npt"] = - $this->nextpage->store( + $this->backend->store( explode("?", $json["next"])[1], - "news" + "news", + $proxy ); } @@ -2415,192 +2444,193 @@ class ddg{ return "https://" . $parse["host"] . "/th?id=" . 
urlencode($parts["id"]); } - private function htmltoarray($html){ + private function appendtext($payload, &$text, &$index){ - $html = strip_tags($html, ["img", "pre", "code", "br", "h1", "h2", "h3", "h4", "h5", "h6", "blockquote", "a"]); + if(trim($payload) == ""){ + + return; + } - libxml_use_internal_errors(true); - $dom = new DOMDocument("1.0", "utf-8"); - $dom->loadHTML('
' . $html . '
'); - $xpath = new DOMXPath($dom); - $descendants = $xpath->query('//div/node()'); + if( + $index !== 0 && + $text[$index - 1]["type"] == "text" + ){ + + $text[$index - 1]["value"] .= preg_replace('/ $/', " ", $payload); + }else{ + + $text[] = [ + "type" => "text", + "value" => preg_replace('/ $/', " ", $payload) + ]; + $index++; + } + } + + private function stackoverflow_parse($html){ - $images = $xpath->query('//div/node()/img'); - $imageiterator = 0; + $i = 0; + $answer = []; - if(count($descendants) === 0){ + $this->fuckhtml->load($html); + + $tags = $this->fuckhtml->getElementsByTagName("*"); + + if(count($tags) === 0){ return [ - "type" => "text", - "value" => $this->unescapehtml($html) + [ + "type" => "text", + "value" => htmlspecialchars_decode($html) + ] ]; } - $array = []; - $previoustype = null; - - foreach($descendants as $node){ + foreach($tags as $snippet){ - // $node->nodeValue = iconv("UTF-8", "ISO-8859-1//TRANSLIT", $node->nodeValue); - - // get node type - switch($node->nodeName){ - case "#text": - $type = "text"; - break; + switch($snippet["tagName"]){ - case "pre": - $type = "code"; - break; - - case "code": - $type = "inline_code"; - break; - - case "h1": - case "h2": - case "h3": - case "h4": - case "h5": - case "h6": - $type = "title"; - break; - - case "blockquote": - $type = "quote"; - break; - - case "a": - $type = "link"; + case "p": + $this->fuckhtml->load($snippet["innerHTML"]); + + $codetags = + $this->fuckhtml + ->getElementsByTagName("*"); + + $tmphtml = $snippet["innerHTML"]; + + foreach($codetags as $tag){ + + if(!isset($tag["outerHTML"])){ + + continue; + } + + $tmphtml = + explode( + $tag["outerHTML"], + $tmphtml, + 2 + ); + + $value = $this->fuckhtml->getTextContent($tmphtml[0], false, false); + $this->appendtext($value, $answer, $i); + + $type = null; + switch($tag["tagName"]){ + + case "code": $type = "inline_code"; break; + case "em": $type = "italic"; break; + case "blockquote": $type = "quote"; break; + default: $type = 
"text"; + } + + if($type !== null){ + $value = $this->fuckhtml->getTextContent($tag, false, false); + + if(trim($value) != ""){ + + $answer[] = [ + "type" => $type, + "value" => rtrim($value) + ]; + $i++; + } + } + + if(count($tmphtml) === 2){ + + $tmphtml = $tmphtml[1] . "\n"; + }else{ + + break; + } + } + + if(is_array($tmphtml)){ + + $tmphtml = $tmphtml[0]; + } + + if(strlen($tmphtml) !== 0){ + + $value = $this->fuckhtml->getTextContent($tmphtml, true, false); + $this->appendtext($value, $answer, $i); + } break; case "img": - $type = "image"; - break; - } - - // add node to array - switch($type){ - - case "text": - $value = preg_replace( - '/ {2,}/', - " ", - $this->limitnewlines($this->unescapehtml($node->textContent)) - ); - - if( - $previoustype == "quote" || - $previoustype === null || - $previoustype == "image" || - $previoustype == "title" || - $previoustype == "code" - ){ - - $value = ltrim($value); - } - - if($value == ""){ - - $previoustype = $type; - continue 2; - } - - // merge with previous text node - if($previoustype == "text"){ - - $array[count($array) - 1]["value"] = trim($array[count($array) - 1]["value"]) . "\n" . 
$this->bstoutf8($value); - }else{ - - $array[] = [ - "type" => "text", - "value" => $this->bstoutf8($value) - ]; - } - break; - - case "inline_code": - case "bold": - $array[] = [ - "type" => "inline_code", - "value" => $this->bstoutf8(trim($this->limitnewlines($this->unescapehtml($node->textContent)))) - ]; - break; - - case "link": - // check for link nested inside of image - - if(strlen($node->childNodes->item(0)->textContent) !== 0){ - - $array[] = [ - "type" => "link", - "value" => $this->bstoutf8(trim($this->unescapehtml($node->textContent))), - "url" => $this->bstoutf8(preg_replace('/\/ddg$/', "", preg_replace('/^http:\/\//', "https://", $this->sanitizeurl($node->getAttribute("href"))))) - ]; - break; - } - - $type = "image"; - - if($previoustype == "text"){ - - $array[count($array) - 1]["value"] = rtrim($array[count($array) - 1]["value"]); - } - - $array[] = [ + $answer[] = [ "type" => "image", - "url" => $this->bstoutf8(preg_replace('/^http:\/\//', "https://", preg_replace('/^\/\/images\.duckduckgo\.com\/iu\/\?u=/', "", $images->item($imageiterator)->getAttribute("src")))) + "url" => + $this->fuckhtml + ->getTextContent( + $tag["attributes"]["src"] + ) ]; + $i++; + break; + + case "pre": + switch($answer[$i - 1]["type"]){ + + case "text": + case "italic": + $answer[$i - 1]["value"] = rtrim($answer[$i - 1]["value"]); + break; + } - $imageiterator++; + $answer[] = + [ + "type" => "code", + "value" => + rtrim( + $this->fuckhtml + ->getTextContent( + $snippet, + true, + false + ) + ) + ]; + $i++; break; - case "image": + case "ol": + $o = 0; - if($previoustype == "text"){ - - $array[count($array) - 1]["value"] = rtrim($array[count($array) - 1]["value"]); - } + $this->fuckhtml->load($snippet); + $li = + $this->fuckhtml + ->getElementsByTagName("li"); - $array[] = [ - "type" => "image", - "url" => $this->bstoutf8(preg_replace('/^http:\/\//', "https://", preg_replace('/^\/\/images\.duckduckgo\.com\/iu\/\?u=/', "", $node->getAttribute("src")))) - ]; - break; - - 
case "quote": - case "title": - case "code": - if($previoustype == "text"){ + foreach($li as $elem){ + $o++; - $array[count($array) - 1]["value"] = rtrim($array[count($array) - 1]["value"]); - } - // no break - - default: - - $value = trim($this->limitnewlines($this->unescapehtml($node->textContent))); - if($type != "code"){ - - $value = preg_replace( - '/ {2,}/', - " ", - $value + $this->appendtext( + $o . ". " . + $this->fuckhtml + ->getTextContent( + $elem + ), + $answer, + $i ); } - - $array[] = [ - "type" => $type, - "value" => $this->bstoutf8($value) - ]; break; } - - $previoustype = $type; } - return $array; + if( + $i !== 0 && + $answer[$i - 1]["type"] == "text" + ){ + + $answer[$i - 1]["value"] = rtrim($answer[$i - 1]["value"]); + } + + return $answer; } private function bstoutf8($bs){ diff --git a/scraper/facebook.php b/scraper/facebook.php index 7bd576b..395a863 100644 --- a/scraper/facebook.php +++ b/scraper/facebook.php @@ -9,6 +9,9 @@ class facebook{ include "lib/nextpage.php"; $this->nextpage = new nextpage("fb"); + + include "lib/proxy_pool.php"; + $this->proxy = new proxy_pool("facebook"); } public function getfilters($page){ @@ -104,6 +107,8 @@ class facebook{ curl_setopt($curlproc, CURLOPT_SSL_VERIFYPEER, true); curl_setopt($curlproc, CURLOPT_CONNECTTIMEOUT, 30); curl_setopt($curlproc, CURLOPT_TIMEOUT, 30); + + $this->proxy->assign_proxy($curlproc); $data = curl_exec($curlproc); diff --git a/scraper/ftm.php b/scraper/ftm.php index af39c12..0cdfbb3 100644 --- a/scraper/ftm.php +++ b/scraper/ftm.php @@ -4,8 +4,8 @@ class ftm{ public function __construct(){ - include "lib/nextpage.php"; - $this->nextpage = new nextpage("ftm"); + include "lib/backend.php"; + $this->backend = new backend("ftm"); } public function getfilters($page){ @@ -13,7 +13,7 @@ class ftm{ return []; } - private function get($url, $search, $offset){ + private function get($proxy, $url, $search, $offset){ $curlproc = curl_init(); @@ -29,7 +29,7 @@ class ftm{ curl_setopt($curlproc, 
CURLOPT_ENCODING, ""); // default encoding curl_setopt($curlproc, CURLOPT_HTTPHEADER, - ["User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:107.0) Gecko/20100101 Firefox/110.0", + ["User-Agent: " . config::USER_AGENT, "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8", "Accept-Language: en-US,en;q=0.5", "Accept-Encoding: gzip", @@ -56,6 +56,8 @@ class ftm{ curl_setopt($curlproc, CURLOPT_SSL_VERIFYPEER, true); curl_setopt($curlproc, CURLOPT_CONNECTTIMEOUT, 30); curl_setopt($curlproc, CURLOPT_TIMEOUT, 30); + + $this->backend->assign_proxy($curlproc, $proxy); $data = curl_exec($curlproc); @@ -70,8 +72,6 @@ class ftm{ public function image($get){ - $search = $get["s"]; - $out = [ "status" => "ok", "npt" => null, @@ -80,16 +80,28 @@ class ftm{ if($get["npt"]){ - $count = (int)$this->nextpage->get($get["npt"], "images"); + [$data, $proxy] = $this->backend->get($get["npt"], "images"); + $data = json_decode($data, true); + + $count = $data["count"]; + $search = $data["search"]; }else{ + $search = $get["s"]; + if(strlen($search) === 0){ + + throw new Exception("Search term is empty!"); + } + $count = 0; + $proxy = $this->backend->get_ip(); } try{ $json = json_decode( $this->get( + $proxy, "https://findthatmeme.com/api/v1/search", $search, $count @@ -134,14 +146,15 @@ class ftm{ ]; } - if($count === 50){ - - $out["npt"] = - $this->nextpage->store( - $count, - "images" - ); - } + $out["npt"] = + $this->backend->store( + json_encode([ + "count" => $count, + "search" => $search + ]), + "images", + $proxy + ); return $out; } diff --git a/scraper/google.php b/scraper/google.php index ca77231..055d12a 100644 --- a/scraper/google.php +++ b/scraper/google.php @@ -10,8 +10,8 @@ class google{ include "lib/fuckhtml.php"; $this->fuckhtml = new fuckhtml(); - include "lib/nextpage.php"; - $this->nextpage = new nextpage("google"); + include "lib/backend.php"; + $this->backend = new backend("google"); } public function getfilters($page){ 
@@ -727,7 +727,7 @@ class google{ } } - private function get($url, $get = []){ + private function get($proxy, $url, $get = []){ $headers = [ "User-Agent: Mozilla/5.0 (Linux; U; Android 2.3.3; pt-pt; LG-P500h-parrot Build/GRI40) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1 MMS/LG-Android-MMS-V1.0/1.2", @@ -760,6 +760,8 @@ class google{ curl_setopt($curlproc, CURLOPT_SSL_VERIFYPEER, true); curl_setopt($curlproc, CURLOPT_CONNECTTIMEOUT, 30); curl_setopt($curlproc, CURLOPT_TIMEOUT, 30); + + $this->backend->assign_proxy($curlproc, $proxy); $data = curl_exec($curlproc); @@ -771,7 +773,7 @@ class google{ curl_close($curlproc); return $data; } - + /* public function web($get){ $search = $get["s"]; @@ -877,9 +879,9 @@ class google{ if(count($title) !== 0){ - /* - Container is a web link - */ + // + // Container is a web link + // $web = [ "title" => $this->titledots( @@ -1051,9 +1053,9 @@ class google{ continue; } - /* - Parse rating object - */ + // + // Parse rating object + // if($is_rating >= -1){ @@ -1102,9 +1104,9 @@ class google{ continue; } - /* - Parse standalone text - */ + // + // Parse standalone text + // $additional_info[] = $innertext; } } @@ -1194,9 +1196,9 @@ class google{ $container_title == "people also search for" ){ - /* - Parse related searches - */ + // + // Parse related searches + // $as = $this->fuckhtml ->getElementsByTagName("a"); @@ -1212,9 +1214,9 @@ class google{ continue; } - /* - Parse image carousel - */ + // + // Parse image carousel + // $title_container = $this->fuckhtml ->getElementsByClassName( @@ -1239,9 +1241,9 @@ class google{ if($title_container == "imagesview all"){ - /* - Image carousel - */ + // + // Image carousel + // $pcitem = $this->fuckhtml ->getElementsByClassName( @@ -1316,9 +1318,9 @@ class google{ } } - /* - Get next page - */ + // + // Get next page + // $as = $this->fuckhtml ->getElementsByTagName("a"); @@ -1340,7 +1342,7 @@ class google{ } return $out; - } + }*/ public function image($get){ 
@@ -1348,17 +1350,22 @@ class google{ // generate parameters if($get["npt"]){ - $params = - json_decode( - $this->nextpage->get( - $get["npt"], - "images" - ), - true + [$params, $proxy] = + $this->backend->get( + $get["npt"], + "images" ); + + $params = json_decode($params, true); }else{ $search = $get["s"]; + if(strlen($search) === 0){ + + throw new Exception("Search term is empty!"); + } + + $proxy = $this->backend->get_ip(); $country = $get["country"]; $nsfw = $get["nsfw"]; $lang = $get["lang"]; @@ -1475,6 +1482,7 @@ class google{ try{ $html = $this->get( + $proxy, "https://www.google.com/search", $params ); @@ -1578,9 +1586,10 @@ class google{ $params["ijn"] = (int)$params["ijn"] + 1; $out["npt"] = - $this->nextpage->store( + $this->backend->store( json_encode($params), - "images" + "images", + $proxy ); }else{ @@ -1628,9 +1637,10 @@ class google{ $params["imgvl"] = $imgvl; $out["npt"] = - $this->nextpage->store( + $this->backend->store( json_encode($params), - "images" + "images", + $proxy ); } } diff --git a/scraper/imgur.php b/scraper/imgur.php index 4a16de7..23efe00 100644 --- a/scraper/imgur.php +++ b/scraper/imgur.php @@ -4,11 +4,11 @@ class imgur{ public function __construct(){ - include "lib/nextpage.php"; - $this->nextpage = new nextpage("imgur"); - include "lib/fuckhtml.php"; $this->fuckhtml = new fuckhtml(); + + include "lib/backend.php"; + $this->backend = new backend("imgur"); } public function getfilters($page){ @@ -57,7 +57,7 @@ class imgur{ ]; } - private function get($url, $get = []){ + private function get($proxy, $url, $get = []){ $curlproc = curl_init(); @@ -70,7 +70,7 @@ class imgur{ curl_setopt($curlproc, CURLOPT_ENCODING, ""); // default encoding curl_setopt($curlproc, CURLOPT_HTTPHEADER, - ["User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:107.0) Gecko/20100101 Firefox/110.0", + ["User-Agent: " . 
config::USER_AGENT, "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8", "Accept-Language: en-US,en;q=0.5", "Accept-Encoding: gzip", @@ -89,6 +89,8 @@ class imgur{ curl_setopt($curlproc, CURLOPT_SSL_VERIFYPEER, true); curl_setopt($curlproc, CURLOPT_CONNECTTIMEOUT, 30); curl_setopt($curlproc, CURLOPT_TIMEOUT, 30); + + $this->backend->assign_proxy($curlproc, $proxy); $data = curl_exec($curlproc); @@ -105,15 +107,14 @@ class imgur{ if($get["npt"]){ - $filter = - json_decode( - $this->nextpage->get( - $get["npt"], - "images" - ), - true + [$filter, $proxy] = + $this->backend->get( + $get["npt"], + "images" ); + $filter = json_decode($filter, true); + $search = $filter["s"]; unset($filter["s"]); @@ -134,6 +135,12 @@ class imgur{ }else{ $search = $get["s"]; + if(strlen($search) === 0){ + + throw new Exception("Search term is empty!"); + } + + $proxy = $this->backend->get_ip(); $sort = $get["sort"]; $time = $get["time"]; $format = $get["format"]; @@ -165,6 +172,7 @@ class imgur{ try{ $html = $this->get( + $proxy, "https://imgur.com/search/$sort/$time/page/$page", $filter ); @@ -238,9 +246,10 @@ class imgur{ $filter["page"] = $page + 1; $out["npt"] = - $this->nextpage->store( + $this->backend->store( json_encode($filter), - "images" + "images", + $proxy ); } diff --git a/scraper/marginalia.php b/scraper/marginalia.php index c8ab09f..b790a97 100644 --- a/scraper/marginalia.php +++ b/scraper/marginalia.php @@ -3,7 +3,8 @@ class marginalia{ public function __construct(){ - $this->key = "public"; + include "lib/backend.php"; + $this->backend = new backend("marginalia"); } public function getfilters($page){ @@ -76,10 +77,10 @@ class marginalia{ } } - private function get($url, $get = []){ + private function get($proxy, $url, $get = []){ $headers = [ - "User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:107.0) Gecko/20100101 Firefox/110.0", + "User-Agent: " . 
config::USER_AGENT, "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8", "Accept-Language: en-US,en;q=0.5", "Accept-Encoding: gzip", @@ -109,6 +110,8 @@ class marginalia{ curl_setopt($curlproc, CURLOPT_SSL_VERIFYPEER, true); curl_setopt($curlproc, CURLOPT_CONNECTTIMEOUT, 30); curl_setopt($curlproc, CURLOPT_TIMEOUT, 30); + + $this->backend->assign_proxy($curlproc, $proxy); $data = curl_exec($curlproc); @@ -124,6 +127,11 @@ class marginalia{ public function web($get){ $search = [$get["s"]]; + if(strlen($get["s"]) === 0){ + + throw new Exception("Search term is empty!"); + } + $profile = $get["profile"]; $format = $get["format"]; $file = $get["file"]; @@ -184,7 +192,8 @@ class marginalia{ try{ $json = $this->get( - "https://api.marginalia.nu/{$this->key}/search/" . urlencode($search), + $this->backend->get_ip(), // no nextpage + "https://api.marginalia.nu/" . config::MARGINALIA_API_KEY . "/search/" . urlencode($search), $params ); }catch(Exception $error){ diff --git a/scraper/mojeek.php b/scraper/mojeek.php index e7e8abc..3d91c09 100644 --- a/scraper/mojeek.php +++ b/scraper/mojeek.php @@ -6,8 +6,8 @@ class mojeek{ include "lib/fuckhtml.php"; $this->fuckhtml = new fuckhtml(); - include "lib/nextpage.php"; - $this->nextpage = new nextpage("mojeek"); + include "lib/backend.php"; + $this->backend = new backend("mojeek"); } public function getfilters($page){ @@ -371,10 +371,10 @@ class mojeek{ } } - private function get($url, $get = []){ + private function get($proxy, $url, $get = []){ $headers = [ - "User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:107.0) Gecko/20100101 Firefox/110.0", + "User-Agent: " . 
config::USER_AGENT, "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8", "Accept-Language: en-US,en;q=0.5", "Accept-Encoding: gzip", @@ -404,6 +404,8 @@ class mojeek{ curl_setopt($curlproc, CURLOPT_SSL_VERIFYPEER, true); curl_setopt($curlproc, CURLOPT_CONNECTTIMEOUT, 30); curl_setopt($curlproc, CURLOPT_TIMEOUT, 30); + + $this->backend->assign_proxy($curlproc, $proxy); $data = curl_exec($curlproc); @@ -420,11 +422,12 @@ class mojeek{ if($get["npt"]){ - $token = $this->nextpage->get($get["npt"], "web"); + [$token, $proxy] = $this->backend->get($get["npt"], "web"); try{ $html = $this->get( + $proxy, "https://www.mojeek.com" . $token, [] ); @@ -485,9 +488,12 @@ class mojeek{ $params["si"] = $domain; } + $proxy = $this->backend->get_ip(); + try{ $html = $this->get( + $proxy, "https://www.mojeek.com/search", $params ); @@ -529,88 +535,90 @@ class mojeek{ return $out; } - $this->fuckhtml->load($results[0]); - /* - Get search results + Get all search result divs */ - $results = - $this->fuckhtml - ->getElementsByTagName("li"); - - foreach($results as $result){ + foreach($results as $container){ - $data = [ - "title" => null, - "description" => null, - "url" => null, - "date" => null, - "type" => "web", - "thumb" => [ - "url" => null, - "ratio" => null - ], - "sublink" => [], - "table" => [] - ]; - - $this->fuckhtml->load($result); - - $title = + $this->fuckhtml->load($container); + $results = $this->fuckhtml - ->getElementsByClassName("title", "a")[0]; + ->getElementsByTagName("li"); - $data["title"] = - html_entity_decode( - $this->fuckhtml - ->getTextContent( - $title["innerHTML"] - ) - ); - - $data["url"] = - html_entity_decode( - $this->fuckhtml - ->getTextContent( - $title["attributes"]["href"] - ) - ); - - $description = - $this->fuckhtml - ->getElementsByClassName( - "s", "p" - ); - - if(count($description) !== 0){ + foreach($results as $result){ - $data["description"] = - $this->titledots( - html_entity_decode( - 
$this->fuckhtml - ->getTextContent( - $description[0] - ) + $data = [ + "title" => null, + "description" => null, + "url" => null, + "date" => null, + "type" => "web", + "thumb" => [ + "url" => null, + "ratio" => null + ], + "sublink" => [], + "table" => [] + ]; + + $this->fuckhtml->load($result); + + $title = + $this->fuckhtml + ->getElementsByClassName("title", "a")[0]; + + $data["title"] = + html_entity_decode( + $this->fuckhtml + ->getTextContent( + $title["innerHTML"] ) ); - } - - $data["date"] = - explode( - " - ", - $this->fuckhtml - ->getTextContent( + + $data["url"] = + html_entity_decode( $this->fuckhtml - ->getElementsByClassName("i", "p")[1] - ) - ); - - $data["date"] = - strtotime( - $data["date"][count($data["date"]) - 1] - ); - - $out["web"][] = $data; + ->getTextContent( + $title["attributes"]["href"] + ) + ); + + $description = + $this->fuckhtml + ->getElementsByClassName( + "s", "p" + ); + + if(count($description) !== 0){ + + $data["description"] = + $this->titledots( + html_entity_decode( + $this->fuckhtml + ->getTextContent( + $description[0] + ) + ) + ); + } + + $data["date"] = + explode( + " - ", + $this->fuckhtml + ->getTextContent( + $this->fuckhtml + ->getElementsByClassName("i", "p")[1] + ) + ); + + $data["date"] = + strtotime( + $data["date"][count($data["date"]) - 1] + ); + + $out["web"][] = $data; + } } /* @@ -969,12 +977,13 @@ class mojeek{ if($a["innerHTML"] == "Next"){ - $out["npt"] = $this->nextpage->store( + $out["npt"] = $this->backend->store( $this->fuckhtml ->getTextContent( $a["attributes"]["href"] ), - "web" + "web", + $proxy ); } } @@ -1001,6 +1010,7 @@ class mojeek{ try{ $html = $this->get( + $this->backend->get_ip(), "https://www.mojeek.com/search", [ "q" => $search, @@ -1011,168 +1021,139 @@ class mojeek{ throw new Exception("Failed to get HTML"); } - /* $handle = fopen("scraper/mojeek.html", "r"); $html = fread($handle, filesize("scraper/mojeek.html")); - fclose($handle);*/ - - /* - Get big, standard and smaller nodes + 
fclose($handle); */ - foreach( - [ - "results-extended", - "results-standard" - ] - as $categoryname - ){ + + $this->fuckhtml->load($html); + + $articles = + $this->fuckhtml->getElementsByTagName("article"); + + foreach($articles as $article){ - $this->fuckhtml->load($html); + $this->fuckhtml->load($article); - $categories = + $data = [ + "title" => null, + "author" => null, + "description" => null, + "date" => null, + "thumb" => + [ + "url" => null, + "ratio" => null + ], + "url" => null + ]; + + $a = $this->fuckhtml->getElementsByTagName("a")[0]; + + $data["title"] = $this->fuckhtml - ->getElementsByClassName( - $categoryname, - "ul" + ->getTextContent( + $a["attributes"]["title"] ); - - foreach($categories as $category){ - - $this->fuckhtml->load($category); - - $nodes = + + $data["url"] = + $this->fuckhtml + ->getTextContent( + $a["attributes"]["href"] + ); + + $p = $this->fuckhtml->getElementsByTagName("p"); + + $data["description"] = + $this->titledots( $this->fuckhtml - ->getElementsByTagName("li"); - - foreach($nodes as $node){ - - $data = [ - "title" => null, - "author" => null, - "description" => null, - "date" => null, - "thumb" => - [ - "url" => null, - "ratio" => null - ], - "url" => null - ]; - - /* - Parse the results - */ - $this->fuckhtml->load($node); - - // get title + url - $a = - $this->fuckhtml - ->getElementsByTagName("a")[0]; - - $data["title"] = - $this->fuckhtml - ->getTextContent( - $a["attributes"]["title"] - ); - - $data["url"] = - $this->fuckhtml - ->getTextContent( - $a["attributes"]["href"] - ); - - // get image - $image = - $this->fuckhtml - ->getElementsByTagName("img"); - - if(count($image) !== 0){ - - $data["thumb"] = [ - "url" => - urldecode( - str_replace( - "/image?img=", - "", - $this->fuckhtml - ->getTextContent( - $image[0]["attributes"]["src"] - ) - ) - ), - "ratio" => "16:9" - ]; - } - - // get description - $description = - $this->fuckhtml - ->getElementsByClassName("s", "p"); - - if(count($description) !== 0){ - - 
$data["description"] = - $this->titledots( - $this->fuckhtml - ->getTextContent( - $description[0] - ) - ); - } - - // get date + time - $date = + ->getTextContent( $this->fuckhtml ->getElementsByClassName( - "date", - "p" - ); - - $i = - $this->fuckhtml - ->getElementsByClassName("i", "p"); - - if(count($date) !== 0){ - - // we're inside a big node - $data["date"] = strtotime($date[0]["innerHTML"]); - - if(count($i) !== 0){ - - $this->fuckhtml->load($i[0]); - - $a = - $this->fuckhtml - ->getElementsByTagName("a"); - - if(count($a) !== 0){ - - $data["author"] = - $this->fuckhtml - ->getTextContent($a[0]); - } - } - }else{ - - // we're inside a small node - if(count($i) !== 0){ - - $i = - explode( - " - ", - $this->fuckhtml - ->getTextContent($i[0]) - ); - - $data["date"] = strtotime(array_pop($i)); - $data["author"] = implode(" - ", $i); - } - } - - $out["news"][] = $data; - } + "s", + $p + )[0] + ) + ); + + if($data["description"] == ""){ + + $data["description"] = null; } + + // get date from big node + $date = + $this->fuckhtml + ->getElementsByClassName( + "date", + $p + ); + + if(count($date) !== 0){ + + $data["date"] = + strtotime( + $this->fuckhtml + ->getTextContent( + $date[0] + ) + ); + } + + // grep date + author + $s = + $this->fuckhtml + ->getElementsByClassName( + "i", + $p + )[0]; + + $this->fuckhtml->load($s); + + $a = + $this->fuckhtml + ->getElementsByTagName("a"); + + if(count($a) !== 0){ + + // parse big node information + $data["author"] = + $this->fuckhtml + ->getTextContent( + $a[0]["innerHTML"] + ); + }else{ + + // parse smaller nodes + $replace = + $this->fuckhtml + ->getElementsByTagName("time")[0]; + + $data["date"] = + strtotime( + $this->fuckhtml + ->getTextContent( + $replace + ) + ); + + $s["innerHTML"] = + str_replace( + $replace["outerHTML"], + "", + $s["innerHTML"] + ); + + $data["author"] = + preg_replace( + '/ • $/', + "", + $s["innerHTML"] + ); + } + + $out["news"][] = $data; } return $out; diff --git a/scraper/pinterest.php 
b/scraper/pinterest.php index 2bb5b71..37473a1 100644 --- a/scraper/pinterest.php +++ b/scraper/pinterest.php @@ -6,6 +6,9 @@ class pinterest{ include "lib/nextpage.php"; $this->nextpage = new nextpage("pinterest"); + + include "lib/proxy_pool.php"; + $this->proxy = new proxy_pool("pinterest"); } public function getfilters($page){ @@ -44,6 +47,8 @@ class pinterest{ curl_setopt($curlproc, CURLOPT_SSL_VERIFYPEER, true); curl_setopt($curlproc, CURLOPT_CONNECTTIMEOUT, 30); curl_setopt($curlproc, CURLOPT_TIMEOUT, 30); + + $this->proxy->assign_proxy($curlproc); $data = curl_exec($curlproc); diff --git a/scraper/sc.php b/scraper/sc.php index 1f49f95..16d3931 100644 --- a/scraper/sc.php +++ b/scraper/sc.php @@ -4,10 +4,8 @@ class sc{ public function __construct(){ - include "lib/nextpage.php"; - $this->nextpage = new nextpage("sc"); - $this->client_id = "ArYppSEotE3YiXCO4Nsgid2LLqJutiww"; - $this->user_id = "766585-580597-163310-929698"; + include "lib/backend.php"; + $this->backend = new backend("sc"); } public function getfilters($page){ @@ -27,7 +25,7 @@ class sc{ ]; } - private function get($url, $get = []){ + private function get($proxy, $url, $get = []){ $curlproc = curl_init(); @@ -40,7 +38,7 @@ class sc{ curl_setopt($curlproc, CURLOPT_ENCODING, ""); // default encoding curl_setopt($curlproc, CURLOPT_HTTPHEADER, - ["User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/116.0", + ["User-Agent: " . 
config::USER_AGENT, "Accept: application/json, text/javascript, */*; q=0.01", "Accept-Language: en-US,en;q=0.5", "Accept-Encoding: gzip", @@ -58,6 +56,8 @@ class sc{ curl_setopt($curlproc, CURLOPT_SSL_VERIFYPEER, true); curl_setopt($curlproc, CURLOPT_CONNECTTIMEOUT, 30); curl_setopt($curlproc, CURLOPT_TIMEOUT, 30); + + $this->backend->assign_proxy($curlproc, $proxy); $data = curl_exec($curlproc); @@ -74,7 +74,7 @@ class sc{ if($get["npt"]){ - $params = $this->nextpage->get($get["npt"], "music"); + [$params, $proxy] = $this->backend->get($get["npt"], "music"); $params = json_decode($params, true); $url = $params["url"]; @@ -101,7 +101,13 @@ class sc{ // https://api-v2.soundcloud.com/search/playlists_without_albums?q=freddie%20dredd&variant_ids=&facet=genre&user_id=630591-269800-703400-765403&client_id=iMxZgT5mfGstBj8GWJbYMvpzelS8ne0E&limit=20&offset=0&linked_partitioning=1&app_version=1693487844&app_locale=en $search = $get["s"]; + if(strlen($search) === 0){ + + throw new Exception("Search term is empty!"); + } + $type = $get["type"]; + $proxy = $this->backend->get_ip(); switch($type){ @@ -111,8 +117,8 @@ class sc{ "q" => $search, "variant_ids" => "", "facet" => "model", - "user_id" => $this->user_id, - "client_id" => $this->client_id, + "user_id" => config::SC_USER_ID, + "client_id" => config::SC_CLIENT_TOKEN, "limit" => 20, "offset" => 0, "linked_partitioning" => 1, @@ -127,8 +133,8 @@ class sc{ "q" => $search, "variant_ids" => "", "facet_genre" => "", - "user_id" => $this->user_id, - "client_id" => $this->client_id, + "user_id" => config::SC_USER_ID, + "client_id" => config::SC_CLIENT_TOKEN, "limit" => 20, "offset" => 0, "linked_partitioning" => 1, @@ -143,8 +149,8 @@ class sc{ "q" => $search, "variant_ids" => "", "facet" => "place", - "user_id" => $this->user_id, - "client_id" => $this->client_id, + "user_id" => config::SC_USER_ID, + "client_id" => config::SC_CLIENT_TOKEN, "limit" => 20, "offset" => 0, "linked_partitioning" => 1, @@ -159,8 +165,8 @@ class sc{ 
"q" => $search, "variant_ids" => "", "facet" => "genre", - "user_id" => $this->user_id, - "client_id" => $this->client_id, + "user_id" => config::SC_USER_ID, + "client_id" => config::SC_CLIENT_TOKEN, "limit" => 20, "offset" => 0, "linked_partitioning" => 1, @@ -175,8 +181,8 @@ class sc{ "q" => $search, "variant_ids" => "", "facet" => "genre", - "user_id" => $this->user_id, - "client_id" => $this->client_id, + "user_id" => config::SC_USER_ID, + "client_id" => config::SC_CLIENT_TOKEN, "limit" => 20, "offset" => 0, "linked_partitioning" => 1, @@ -192,8 +198,8 @@ class sc{ "variant_ids" => "", "filter.content_tier" => "SUB_HIGH_TIER", "facet" => "genre", - "user_id" => $this->user_id, - "client_id" => $this->client_id, + "user_id" => config::SC_USER_ID, + "client_id" => config::SC_CLIENT_TOKEN, "limit" => 20, "offset" => 0, "linked_partitioning" => 1, @@ -206,7 +212,7 @@ class sc{ try{ - $json = $this->get($url, $params); + $json = $this->get($proxy, $url, $params); }catch(Exception $error){ @@ -244,9 +250,10 @@ class sc{ $params["url"] = $url; // we will remove this later $out["npt"] = - $this->nextpage->store( + $this->backend->store( json_encode($params), - "music" + "music", + $proxy ); } @@ -342,7 +349,7 @@ class sc{ "endpoint" => "audio_sc", "url" => $item["media"]["transcodings"][0]["url"] . - "?client_id=" . $this->client_id . + "?client_id=" . config::SC_CLIENT_TOKEN . "&track_authorization=" . 
$item["track_authorization"] ]; diff --git a/scraper/wiby.php b/scraper/wiby.php index a1daf57..e8351bc 100644 --- a/scraper/wiby.php +++ b/scraper/wiby.php @@ -4,8 +4,8 @@ class wiby{ public function __construct(){ - include "lib/nextpage.php"; - $this->nextpage = new nextpage("wiby"); + include "lib/backend.php"; + $this->backend = new backend("wiby"); } public function getfilters($page){ @@ -36,7 +36,7 @@ class wiby{ ]; } - private function get($url, $get = [], $nsfw){ + private function get($proxy, $url, $get = [], $nsfw){ $curlproc = curl_init(); @@ -45,11 +45,13 @@ class wiby{ $url .= "?" . $get; } + print_r([$proxy, $url]); + curl_setopt($curlproc, CURLOPT_URL, $url); curl_setopt($curlproc, CURLOPT_ENCODING, ""); // default encoding curl_setopt($curlproc, CURLOPT_HTTPHEADER, - ["User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:107.0) Gecko/20100101 Firefox/110.0", + ["User-Agent: " . config::USER_AGENT, "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8", "Accept-Language: en-US,en;q=0.5", "Accept-Encoding: gzip", @@ -69,6 +71,8 @@ class wiby{ curl_setopt($curlproc, CURLOPT_CONNECTTIMEOUT, 30); curl_setopt($curlproc, CURLOPT_TIMEOUT, 30); + $this->backend->assign_proxy($curlproc, $proxy); + $data = curl_exec($curlproc); if(curl_errno($curlproc)){ @@ -84,11 +88,8 @@ class wiby{ if($get["npt"]){ - $q = - json_decode( - $this->nextpage->get($get["npt"], "web"), - true - ); + [$q, $proxy] = $this->backend->get($get["npt"], "web"); + $q = json_decode($q, true); $nsfw = $q["nsfw"]; unset($q["nsfw"]); @@ -100,6 +101,7 @@ class wiby{ throw new Exception("Search term is empty!"); } + $proxy = $this->backend->get_ip(); $date = $get["date"]; $nsfw = $get["nsfw"] == "yes" ? 
"0" : "1"; @@ -150,6 +152,7 @@ class wiby{ try{ $html = $this->get( + $proxy, "https://wiby.me/", $q, $nsfw @@ -171,13 +174,14 @@ class wiby{ }else{ $nextpage = - $this->nextpage->store( + $this->backend->store( json_encode([ "q" => $q["q"], "p" => (int)$nextpage[1], "nsfw" => $nsfw ]), - "web" + "web", + $proxy ); } diff --git a/scraper/yandex.php b/scraper/yandex.php index 65abe73..7335edc 100644 --- a/scraper/yandex.php +++ b/scraper/yandex.php @@ -10,11 +10,11 @@ class yandex{ include "lib/fuckhtml.php"; $this->fuckhtml = new fuckhtml(); - include "lib/nextpage.php"; - $this->nextpage = new nextpage("yandex"); + include "lib/backend.php"; + // backend included in the scraper functions } - private function get($url, $get = [], $nsfw){ + private function get($proxy, $url, $get = [], $nsfw){ $curlproc = curl_init(); @@ -32,7 +32,7 @@ class yandex{ } $headers = - ["User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/113.0", + ["User-Agent: " . config::USER_AGENT, "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8", "Accept-Encoding: gzip", "Accept-Language: en-US,en;q=0.5", @@ -54,6 +54,8 @@ class yandex{ curl_setopt($curlproc, CURLOPT_SSL_VERIFYPEER, true); curl_setopt($curlproc, CURLOPT_CONNECTTIMEOUT, 30); curl_setopt($curlproc, CURLOPT_TIMEOUT, 30); + + $this->backend->assign_proxy($curlproc, $proxy); $data = curl_exec($curlproc); @@ -207,6 +209,8 @@ class yandex{ public function web($get){ + $this->backend = new backend("yandex_w"); + // has captcha // https://yandex.com/search/touch/?text=lol&app_platform=android&appsearch_header=1&ui=webmobileapp.yandex&app_version=23070603&app_id=ru.yandex.searchplugin&search_source=yandexcom_touch_native&clid=2218567 @@ -215,10 +219,11 @@ class yandex{ if($get["npt"]){ - $npt = $this->nextpage->get($get["npt"], "web"); + [$npt, $proxy] = $this->backend->get($get["npt"], "web"); $html = $this->get( + $proxy, "https://yandex.com" . 
$npt, [], "yes" @@ -226,6 +231,12 @@ class yandex{ }else{ $search = $get["s"]; + if(strlen($search) === 0){ + + throw new Exception("Search term is empty!"); + } + + $proxy = $this->backend->get_ip(); $lang = $get["lang"]; $older = $get["older"]; $newer = $get["newer"]; @@ -269,6 +280,7 @@ class yandex{ try{ $html = $this->get( + $proxy, "https://yandex.com/search/site/", $params, "yes" @@ -313,7 +325,7 @@ class yandex{ if(count($npt) !== 0){ $out["npt"] = - $this->nextpage->store( + $this->backend->store( $this->fuckhtml ->getTextContent( $npt @@ -321,7 +333,8 @@ class yandex{ ["attributes"] ["href"] ), - "web" + "web", + $proxy ); } @@ -386,17 +399,18 @@ class yandex{ public function image($get){ + $this->backend = new backend("yandex_i"); + if($get["npt"]){ - $request = - json_decode( - $this->nextpage->get( - $get["npt"], - "images" - ), - true + [$request, $proxy] = + $this->backend->get( + $get["npt"], + "images" ); + $request = json_decode($request, true); + $nsfw = $request["nsfw"]; unset($request["nsfw"]); }else{ @@ -407,6 +421,7 @@ class yandex{ throw new Exception("Search term is empty!"); } + $proxy = $this->backend->get_ip(); $nsfw = $get["nsfw"]; $time = $get["time"]; $size = $get["size"]; @@ -611,9 +626,11 @@ class yandex{ try{ $json = $this->get( + $proxy, "https://yandex.com/images/search", $request, - $nsfw + $nsfw, + "yandex_i" ); }catch(Exception $err){ @@ -676,7 +693,12 @@ class yandex{ $request["p"] = 1; } - $out["npt"] = $this->nextpage->store(json_encode($request), "images"); + $out["npt"] = + $this->backend->store( + json_encode($request), + "images", + $proxy + ); } // get search results @@ -744,21 +766,29 @@ class yandex{ public function video($get){ + $this->backend = new backend("yandex_v"); + if($get["npt"]){ - $params = - json_decode( - $this->nextpage->get( - $get["npt"], - "web" - ), - true + [$params, $proxy] = + $this->backend->get( + $get["npt"], + "video" ); + $params = json_decode($params, true); + $nsfw = $params["nsfw"]; 
unset($params["nsfw"]); }else{ + $search = $get["s"]; + if(strlen($search) === 0){ + + throw new Exception("Search term is empty!"); + } + + $proxy = $this->backend->get_ip(); $nsfw = $get["nsfw"]; $time = $get["time"]; $duration = $get["duration"]; @@ -865,9 +895,11 @@ class yandex{ try{ $json = $this->get( + $proxy, "https://yandex.com/video/search", $params, - $nsfw + $nsfw, + "yandex_v" ); }catch(Exception $error){ @@ -926,9 +958,10 @@ class yandex{ $params["p"] = "1"; $params["nsfw"] = $nsfw; $out["npt"] = - $this->nextpage->store( + $this->backend->store( json_encode($params), - "web" + "video", + $proxy ); } diff --git a/scraper/yep.php b/scraper/yep.php index 8ff4a57..7a73635 100644 --- a/scraper/yep.php +++ b/scraper/yep.php @@ -4,8 +4,8 @@ class yep{ public function __construct(){ - include "lib/nextpage.php"; - $this->nextpage = new nextpage("yep"); + include "lib/backend.php"; + $this->backend = new backend("yep"); } public function getfilters($page){ @@ -238,7 +238,7 @@ class yep{ ]; } - private function get($url, $get = []){ + private function get($proxy, $url, $get = []){ $curlproc = curl_init(); @@ -251,7 +251,7 @@ class yep{ curl_setopt($curlproc, CURLOPT_ENCODING, ""); // default encoding curl_setopt($curlproc, CURLOPT_HTTPHEADER, - ["User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:107.0) Gecko/20100101 Firefox/110.0", + ["User-Agent: " . 
config::USER_AGENT, "Accept: */*", "Accept-Language: en-US,en;q=0.5", "Accept-Encoding: gzip", @@ -269,6 +269,8 @@ class yep{ curl_setopt($curlproc, CURLOPT_SSL_VERIFYPEER, true); curl_setopt($curlproc, CURLOPT_CONNECTTIMEOUT, 30); curl_setopt($curlproc, CURLOPT_TIMEOUT, 30); + + $this->backend->assign_proxy($curlproc, $proxy); $data = curl_exec($curlproc); @@ -284,6 +286,11 @@ class yep{ public function image($get){ $search = $get["s"]; + if(strlen($search) === 0){ + + throw new Exception("Search term is empty!"); + } + $country = $get["country"]; $nsfw = $get["nsfw"]; @@ -305,6 +312,7 @@ class yep{ $json = json_decode( $this->get( + $this->backend->get_ip(), // no nextpage! "https://api.yep.com/fs/2/search", [ "client" => "web", diff --git a/scraper/youtube.php b/scraper/youtube.php index 83a68ba..526b026 100644 --- a/scraper/youtube.php +++ b/scraper/youtube.php @@ -8,8 +8,8 @@ class youtube{ public function __construct(){ - include "lib/nextpage.php"; - $this->nextpage = new nextpage("yt"); + include "lib/backend.php"; + $this->backend = new backend("yt"); } public function getfilters($page){ @@ -340,7 +340,7 @@ class youtube{ const req_web = 0; const req_xhr = 1; - private function get($url, $get = [], $reqtype = self::req_web, $continuation = null){ + private function get($proxy, $url, $get = [], $reqtype = self::req_web, $continuation = null){ $curlproc = curl_init(); @@ -354,7 +354,7 @@ class youtube{ switch($reqtype){ case self::req_web: $headers = - ["User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:107.0) Gecko/20100101 Firefox/110.0", + ["User-Agent: " . config::USER_AGENT, "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8", "Accept-Language: en-US,en;q=0.5", "Accept-Encoding: gzip", @@ -370,7 +370,7 @@ class youtube{ case self::req_xhr: $headers = - ["User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:110.0) Gecko/20100101 Firefox/110.0", + ["User-Agent: " . 
config::USER_AGENT, "Accept: */*", "Accept-Language: en-US,en;q=0.5", "Accept-Encoding: gzip", @@ -397,6 +397,8 @@ class youtube{ curl_setopt($curlproc, CURLOPT_SSL_VERIFYPEER, true); curl_setopt($curlproc, CURLOPT_CONNECTTIMEOUT, 30); curl_setopt($curlproc, CURLOPT_TIMEOUT, 30); + + $this->backend->assign_proxy($curlproc, $proxy); $data = curl_exec($curlproc); @@ -430,17 +432,17 @@ class youtube{ $json = fread($handle, filesize("nextpage.json")); fclose($handle);*/ - $npt = - json_decode( - $this->nextpage->get( - $get["npt"], - "videos" - ), - true + [$npt, $proxy] = + $this->backend->get( + $get["npt"], + "videos" ); + $npt = json_decode($npt, true); + try{ $json = $this->get( + $proxy, "https://www.youtube.com/youtubei/v1/search", [ "key" => $npt["key"], @@ -507,6 +509,7 @@ class youtube{ throw new Exception("Search term is empty!"); } + $proxy = $this->backend->get_ip(); $date = $get["date"]; $type = $get["type"]; $duration = $get["duration"]; @@ -537,6 +540,7 @@ class youtube{ try{ $json = $this->get( + $proxy, "https://www.youtube.com/results", $get ); @@ -942,7 +946,14 @@ class youtube{ if($this->out["npt"] !== null){ - $this->out["npt"] = $this->nextpage->store(json_encode($this->out["npt"]), "videos"); + $this->out["npt"] = + $this->backend->store( + json_encode( + $this->out["npt"] + ), + "videos", + $proxy + ); } return $this->out; diff --git a/settings.php b/settings.php index 41322d6..bee31ea 100644 --- a/settings.php +++ b/settings.php @@ -1,5 +1,7 @@ "Theme", "parameter" => "theme", - "options" => [ - [ - "value" => "dark", - "text" => "Gruvbox dark" - ], - [ - "value" => "cream", - "text" => "Gruvbox cream" - ] - ] + "options" => [] ], [ "description" => "Prevent clicking background elements when image viewer is open", @@ -59,7 +52,7 @@ $settings = [ "name" => "Scrapers to use", "settings" => [ [ - "description" => "Autocomplete
Picking
Auto
changes the source dynamically depending of the page's scraper
Picking
Disabled
disables this feature
", + "description" => "Autocomplete
Picking Auto changes the source dynamically depending of the page's scraper
Warning: If you edit this field, you will need to re-add the search engine so that the new autocomplete settings are applied!
", "parameter" => "scraper_ac", "options" => [ [ @@ -242,6 +235,26 @@ $settings = [ ] ]; +/* + Set theme collection +*/ +$themes = glob("static/themes/*"); + +$settings[0]["settings"][1]["options"][] = [ + "value" => "Dark", + "text" => "Dark" +]; + +foreach($themes as $theme){ + + $theme = explode(".", basename($theme))[0]; + + $settings[0]["settings"][1]["options"][] = [ + "value" => $theme, + "text" => $theme + ]; +} + /* Set cookies */ @@ -262,28 +275,48 @@ if($_POST){ foreach($loop as $key => $value){ - foreach($settings as $title){ + if($key == "theme"){ - foreach($title["settings"] as $list){ + if($value == config::DEFAULT_THEME){ - if( - $list["parameter"] == $key && - $list["options"][0]["value"] == $value - ){ + unset($_COOKIE[$key]); + + setcookie( + "theme", + "", + [ + "expires" => -1, // removes cookie + "samesite" => "Lax", + "path" => "/" + ] + ); + continue; + } + }else{ + + foreach($settings as $title){ + + foreach($title["settings"] as $list){ - unset($_COOKIE[$key]); - - setcookie( - $key, - "", - [ - "expires" => -1, // removes cookie - "samesite" => "Lax", - "path" => "/" - ] - ); - - continue 3; + if( + $list["parameter"] == $key && + $list["options"][0]["value"] == $value + ){ + + unset($_COOKIE[$key]); + + setcookie( + $key, + "", + [ + "expires" => -1, // removes cookie + "samesite" => "Lax", + "path" => "/" + ] + ); + + continue 3; + } } } } @@ -313,19 +346,13 @@ include "lib/frontend.php"; $frontend = new frontend(); echo - '' . - '' . - '' . - '' . - 'Settings' . - '' . - '' . - '' . - '' . - '' . - '' . - '' . - 'getthemeclass() . '>'; + $frontend->load( + "header_nofilters.html", + [ + "title" => "Settings", + "class" => "" + ] + ); $left = '

Settings

' . @@ -376,6 +403,14 @@ foreach($settings as $title){ '
' . $setting["description"] . '
' . ' diff --git a/template/header_nofilters.html b/template/header_nofilters.html new file mode 100644 index 0000000..116eef6 --- /dev/null +++ b/template/header_nofilters.html @@ -0,0 +1,14 @@ + + + + + {%title%} + + + {%style%} + + + + + + diff --git a/template/home.html b/template/home.html index 9818677..b4f0735 100644 --- a/template/home.html +++ b/template/home.html @@ -2,15 +2,17 @@ - 4get + {%server_name%} + + - + + {%style%} - - + - +
- SettingsAPIAboutSourceDonate + SettingsInstancesAPIAboutSourceDonate
- Clearnet: 4get.ca
- Tor: 4getwebfrq5zr4sxugk6htxvawqehxtdgjrbcn2oslllcol2vepa23yd.onion
- Report a problem: lolcat.ca + ClearnetTorReport a problem
+ Running on v{%version%}!!
- + diff --git a/template/images.html b/template/images.html index 1c5b23a..a19ddeb 100644 --- a/template/images.html +++ b/template/images.html @@ -2,6 +2,6 @@ {%images%} {%nextpage%} - + diff --git a/template/instances.html b/template/instances.html new file mode 100644 index 0000000..829e638 --- /dev/null +++ b/template/instances.html @@ -0,0 +1,36 @@ + + + + + Instance browser + + + {%style%} + + + + + + +

Instance browser

+ Learn how to setup your own instance here! https://git.lolcat.ca/lolcat/4get + + + + + + diff --git a/template/search.html b/template/search.html index 35da30d..d7f73a5 100644 --- a/template/search.html +++ b/template/search.html @@ -11,6 +11,6 @@ {%left%} - + diff --git a/videos.php b/videos.php index ddc281c..cf48aac 100644 --- a/videos.php +++ b/videos.php @@ -3,6 +3,8 @@ /* Initialize random shit */ +include "data/config.php"; + include "lib/frontend.php"; $frontend = new frontend(); @@ -28,20 +30,7 @@ try{ }catch(Exception $error){ - echo - $frontend->drawerror( - "Shit", - 'This scraper returned an error:' . - '
' . htmlspecialchars($error->getMessage()) . '
' . - 'Things you can try:' . - '
    ' . - '
  • Use a different scraper
  • ' . - '
  • Remove keywords that could cause errors
  • ' . - '
  • Use another 4get instance
  • ' . - '

' . - 'If the error persists, please contact the administrator.' - ); - die(); + $frontend->drawscrapererror($error->getMessage(), $get, "videos"); } $categories = [ diff --git a/web.php b/web.php index 05700b2..97905be 100644 --- a/web.php +++ b/web.php @@ -3,6 +3,8 @@ /* Initialize random shit */ +include "data/config.php"; + include "lib/frontend.php"; $frontend = new frontend(); @@ -28,20 +30,7 @@ try{ }catch(Exception $error){ - echo - $frontend->drawerror( - "Shit", - 'This scraper returned an error:' . - '
' . htmlspecialchars($error->getMessage()) . '
' . - 'Things you can try:' . - '
    ' . - '
  • Use a different scraper
  • ' . - '
  • Remove keywords that could cause errors
  • ' . - '
  • Use another 4get instance
  • ' . - '

' . - 'If the error persists, please contact the administrator.' - ); - die(); + $frontend->drawscrapererror($error->getMessage(), $get, "web"); } /*