forked from lolcat/4get
Compare commits
24 Commits
Author | SHA1 | Date |
---|---|---|
lolcat | 9ca93f34c6 | |
lolcat | 0a43b9c849 | |
lolcat | b636fec319 | |
lolcat | 774f7113df | |
lolcat | 0b3bbe0f15 | |
lolcat | 5f0b0a7b83 | |
lolcat | 920b9d5b3f | |
lolcat | 9cd369ac08 | |
lolcat | e83865be49 | |
lolcat | 68dd7f29f6 | |
lolcat | aaa30c79f5 | |
lolcat | 070f9d442b | |
lolcat | 9c18753ec3 | |
lolcat | d8a729796e | |
lolcat | 2bbe5a29a9 | |
lolcat | 9ac195ac3b | |
lolcat | d427a48ed4 | |
lolcat | 12d5b4ade8 | |
Pano | c422abbdc6 | |
Pano | 85246cc7ec | |
Pano | d709d12111 | |
Pano | 19f82a8536 | |
Pano | 155a38d454 | |
Pano | 6926e374af |
19
api.txt
19
api.txt
|
@ -1,9 +1,16 @@
|
||||||
__ __ __
|
44
|
||||||
/ // / ____ ____ / /_
|
4444444 44
|
||||||
/ // /_/ __ `/ _ \/ __/
|
44444444 44444 444
|
||||||
/__ __/ /_/ / __/ /_
|
44444444 444444 444444444
|
||||||
/_/ \__, /\___/\__/
|
44444 44444444 444444444
|
||||||
/____/
|
444444444 4444444
|
||||||
|
4444444444 444444
|
||||||
|
4444444444444
|
||||||
|
444444444444444444
|
||||||
|
444444444444444
|
||||||
|
44444444
|
||||||
|
4444
|
||||||
|
44
|
||||||
|
|
||||||
+ Welcome to the 4get API documentation +
|
+ Welcome to the 4get API documentation +
|
||||||
|
|
||||||
|
|
|
@ -119,7 +119,7 @@ class config{
|
||||||
|
|
||||||
// Default user agent to use for scraper requests. Sometimes ignored to get specific webpages
|
// Default user agent to use for scraper requests. Sometimes ignored to get specific webpages
|
||||||
// Changing this might break things.
|
// Changing this might break things.
|
||||||
const USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:129.0) Gecko/20100101 Firefox/129.0";
|
const USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:133.0) Gecko/20100101 Firefox/133.0";
|
||||||
|
|
||||||
// Proxy pool assignments for each scraper
|
// Proxy pool assignments for each scraper
|
||||||
// false = Use server's raw IP
|
// false = Use server's raw IP
|
||||||
|
@ -129,6 +129,7 @@ class config{
|
||||||
const PROXY_BRAVE = false;
|
const PROXY_BRAVE = false;
|
||||||
const PROXY_FB = false; // facebook
|
const PROXY_FB = false; // facebook
|
||||||
const PROXY_GOOGLE = false;
|
const PROXY_GOOGLE = false;
|
||||||
|
const PROXY_GOOGLE_CSE = false;
|
||||||
const PROXY_STARTPAGE = false;
|
const PROXY_STARTPAGE = false;
|
||||||
const PROXY_QWANT = false;
|
const PROXY_QWANT = false;
|
||||||
const PROXY_GHOSTERY = false;
|
const PROXY_GHOSTERY = false;
|
||||||
|
@ -157,6 +158,9 @@ class config{
|
||||||
// Scraper-specific parameters
|
// Scraper-specific parameters
|
||||||
//
|
//
|
||||||
|
|
||||||
|
// GOOGLE CSE
|
||||||
|
const GOOGLE_CX_ENDPOINT = "d4e68b99b876541f0";
|
||||||
|
|
||||||
// MARGINALIA
|
// MARGINALIA
|
||||||
// Use "null" to default out to HTML scraping OR specify a string to
|
// Use "null" to default out to HTML scraping OR specify a string to
|
||||||
// use the API (Eg: "public"). API has less filters.
|
// use the API (Eg: "public"). API has less filters.
|
||||||
|
|
185
docs/nginx.md
185
docs/nginx.md
|
@ -1,27 +1,67 @@
|
||||||
# Install on NGINX
|
<h1 align=center>Installation of 4get in NGINX</h1>
|
||||||
|
|
||||||
>I do NOT recommend following this guide, only follow this if you *really* need to use nginx. I recommend you use the apache2 steps instead.
|
<div align=right>
|
||||||
|
|
||||||
Login as root.
|
> NOTE: As the previous version stated, it is better to follow the <a href="https://git.lolcat.ca/lolcat/4get/src/branch/master/docs/apache2.md">Apache2 guide</a> instead of the Nginx one.
|
||||||
|
|
||||||
Create a file in `/etc/nginx/sites-avaliable/` called `4get.conf` or any name you want and put this into the file:
|
> NOTE: This guide assumes that you're using either an <abbr title="(Arch Linux, Artix Linux, EndeavourOS, etc...)">Arch-based system</abbr> or a <abbr title="(Debian, Ubuntu, Devuan, etc...)">Debian-based system</abbr>, although you can still follow it on other distributions with minor adjustments.
|
||||||
|
|
||||||
```
|
</div>
|
||||||
server {
|
|
||||||
# DO YOU REALLY NEED TO LOG SEARCHES?
|
1. Login as root.
|
||||||
access_log /dev/null;
|
2. Upgrade your system:
|
||||||
error_log /dev/null;
|
* On Arch-based, run `pacman -Syu`.
|
||||||
# Change this if you have 4get in other folder.
|
* On Debian-based, run `apt update`, then `apt upgrade`.
|
||||||
|
3. Install the following dependencies:
|
||||||
|
* `git`: So you can clone <a href="https://git.lolcat.ca/lolcat/4get">this</a> repository.
|
||||||
|
* `nginx`: So you can run Nginx.
|
||||||
|
* `php-fpm`: This is what allows Nginx to run *(and show)* PHP files.
|
||||||
|
* `php-imagick`, `imagemagick`: Image manipulation.
|
||||||
|
* `php-apcu`: Caching module.
|
||||||
|
* `php-curl`, `curl`: Transferring data with URLs.
|
||||||
|
* `php-mbstring`: String utils.
|
||||||
|
* `certbot`, `certbot-nginx`: ACME client. Used to create SSL certificates.
|
||||||
|
* In Arch-based distributions:
|
||||||
|
* `pacman -S nginx php-fpm certbot php-imagick certbot-nginx imagemagick curl php-apcu git`
|
||||||
|
* In Debian-based distributions:
|
||||||
|
* `apt install php-fpm php-mbstring nginx certbot-nginx certbot php-imagick imagemagick php-curl curl php-apcu git`
|
||||||
|
|
||||||
|
<div align=right>
|
||||||
|
|
||||||
|
> IMPORTANT: `php-curl` and `php-mbstring` might be Debian-only packages, but this needs further fact checking.
|
||||||
|
|
||||||
|
> IMPORTANT: If having issues with `php-apcu` or `libsodium`, go to [^1].
|
||||||
|
|
||||||
|
</div>
|
||||||
|
|
||||||
|
4. `cd` to `/etc/nginx` and make the `conf.d/` directory if it doesn't exist:
|
||||||
|
* Again, this assumes you're logged in as root.
|
||||||
|
```sh
|
||||||
|
cd /etc/nginx
|
||||||
|
ls -l conf.d/ # If ls shows conf.d, then it means it exists.
|
||||||
|
# If it does not, run:
|
||||||
|
mkdir conf.d
|
||||||
|
```
|
||||||
|
5. Make a file inside `conf.d/` called `4get.conf` and place the following content:
|
||||||
|
* First run `touch conf.d/4get.conf`, then `nano conf.d/4get.conf` to open the nano editor: *(Install it if it is not installed, or use another editor.)*
|
||||||
|
```sh
|
||||||
|
server {
|
||||||
|
access_log /dev/null; # Search log file. Do you really need to?
|
||||||
|
error_log /dev/null; # Error log file.
|
||||||
|
|
||||||
|
# Change this if you have 4get in another folder.
|
||||||
root /var/www/4get;
|
root /var/www/4get;
|
||||||
# Change yourdomain by your domain lol
|
# Change 'yourdomain' to your domain.
|
||||||
server_name www.yourdomain.com yourdomain.com;
|
server_name www.yourdomain.com yourdomain.com;
|
||||||
|
# Port to listen to.
|
||||||
|
listen 80;
|
||||||
|
|
||||||
location @php {
|
location @php {
|
||||||
try_files $uri.php $uri/index.php =404;
|
try_files $uri.php $uri/index.php =404;
|
||||||
# Change the unix socket address if it's different for you.
|
# Change the unix socket address if it's different for you.
|
||||||
fastcgi_pass unix:/var/run/php-fpm/php-fpm.sock;
|
fastcgi_pass unix:/var/run/php-fpm/php-fpm.sock;
|
||||||
fastcgi_index index.php;
|
fastcgi_index index.php;
|
||||||
# Change this to `fastcgi_params` if you use a debian based distro.
|
# Change this to `fastcgi_params` if you use a debian based distribution.
|
||||||
include fastcgi.conf;
|
include fastcgi.conf;
|
||||||
fastcgi_intercept_errors on;
|
fastcgi_intercept_errors on;
|
||||||
}
|
}
|
||||||
|
@ -34,56 +74,96 @@ server {
|
||||||
return 301 $1;
|
return 301 $1;
|
||||||
}
|
}
|
||||||
|
|
||||||
listen 80;
|
}
|
||||||
}
|
```
|
||||||
```
|
* The above is a very basic configuration and thus will need tweaking to your personal needs. It should still work as-is, though. A 'real world' example is present in [^2].
|
||||||
|
* After saving the file, check that the `nginx.conf` file inside the main directory includes files inside `conf.d/`:
|
||||||
|
* It should be inside the http block: *(The following is an example! Don't just copy and paste it!)*
|
||||||
|
```sh
|
||||||
|
http {
|
||||||
|
include mime.types;
|
||||||
|
include conf.d/*.conf;
|
||||||
|
types_hash_max_size 4096;
|
||||||
|
# ...
|
||||||
|
}
|
||||||
|
```
|
||||||
|
* Now, test your configuration with `nginx -t`. If it says that everything is good, restart *(or start)* the Nginx daemon:
|
||||||
|
* This depends on your init system. Most distributions use `systemd`, but the commands for other common init systems are listed below as well.
|
||||||
|
```sh
|
||||||
|
# systemd
|
||||||
|
systemctl stop nginx
|
||||||
|
systemctl start nginx
|
||||||
|
# or
|
||||||
|
systemctl restart nginx
|
||||||
|
|
||||||
That is a very basic config so you will need to adapt it to your needs in case you have a more complicated nginx configuration. Anyways, you can see a real world example [here](https://git.zzls.xyz/Fijxu/etc-configs/src/branch/selfhost/nginx/sites-available/4get.zzls.xyz.conf)
|
# openrc
|
||||||
|
rc-service nginx stop
|
||||||
|
rc-service nginx start
|
||||||
|
# or
|
||||||
|
rc-service nginx restart
|
||||||
|
|
||||||
After you save the file you will need to do a symlink of the `4get.conf` file to `/etc/nignx/sites-enabled/`, you can do it with this command:
|
# runit
|
||||||
|
sv down nginx
|
||||||
|
sv up nginx
|
||||||
|
# or
|
||||||
|
sv restart nginx
|
||||||
|
|
||||||
```sh
|
# s6
|
||||||
ln -s /etc/nginx/sites-available/4get.conf /etc/nginx/sites-available/4get.conf
|
s6-rc -d change nginx
|
||||||
```
|
s6-rc -u change nginx
|
||||||
|
# or
|
||||||
|
s6-svc -r /run/service/nginx
|
||||||
|
|
||||||
Now test the nginx config with `nginx -t`, if it says that everything is good, restart nginx using `systemctl restart nginx`
|
# dinit
|
||||||
|
dinitctl stop nginx
|
||||||
|
dinitctl start nginx
|
||||||
|
# or
|
||||||
|
dinitctl restart nginx
|
||||||
|
```
|
||||||
|
6. Clone the repository to `/var/www`:
|
||||||
|
* `git clone --depth 1 https://git.lolcat.ca/lolcat/4get 4get` - It clones the repository with the depth of one commit *(so it takes less time to download)* and saves the cloned repository as '4get'.
|
||||||
|
7. That should be it! There are some extra steps you can take, but it really just depends on you.
|
||||||
|
|
||||||
# Encryption setup
|
<h2 align=center>Encryption setup</h2>
|
||||||
|
|
||||||
Generate a certificate for the domain using:
|
1. Generate a certificate for the domain you're using with:
|
||||||
|
* Note that `certbot-nginx` is needed.
|
||||||
|
```sh
|
||||||
|
certbot --nginx --key-type ecdsa -d www.yourdomain.com -d yourdomain.com
|
||||||
|
```
|
||||||
|
2. After that, certbot will deploy the certificate automatically to your 4get conf file; it should be ready to use from there.
|
||||||
|
|
||||||
```sh
|
<h2 align=center>Tor Setup</h2>
|
||||||
certbot --nginx --key-type ecdsa -d www.yourdomain.com -d yourdomain.com
|
|
||||||
```
|
|
||||||
(Remember to install the nginx certbot plugin!!!)
|
|
||||||
|
|
||||||
After doing that certbot should deploy the certificate automatically into your 4get nginx config file. It should be ready to use at that point.
|
<div align=right>
|
||||||
|
|
||||||
# Tor setup on NGINX
|
> IMPORTANT: Tor onion addresses are very long compared to traditional domains, so before doing anything else, edit `nginx.conf` and increase <abbr title="This setting in your Nginx configuration controls the internal data structure used to manage multiple server names (hostnames) associated with your web server. Each hostname requires a certain amount of memory within this structure. If the size is insufficient, Nginx will encounter errors."><code>server_names_hash_bucket_size</code></abbr> to suit your needs.
|
||||||
|
|
||||||
Important Note: Tor onion addresses are significantly longer than traditional domain names. Before proceeding with Nginx configuration, ensure you increase the `server_names_hash_bucket_size` value in your `nginx.conf` file. This setting in your Nginx configuration controls the internal data structure used to manage multiple server names (hostnames) associated with your web server. Each hostname requires a certain amount of memory within this structure. If the size is insufficient, Nginx will encounter errors.
|
</div>
|
||||||
|
|
||||||
1. Open your `nginx.conf` file (that is under `/etc/nginx/nginx.conf`).
|
1. `cd` to `/etc/nginx` *(if you haven't)* and open your `nginx.conf` file.
|
||||||
2. Find the line containing `# server_names_hash_bucket_size 64;`.
|
2. Find the line containing `# server_names_hash_bucket_size 64;` inside said file.
|
||||||
3. Uncomment the line and adjust the value. Start with 64, but if you encounter issues, incrementally increase it (e.g., 128, 256) until it accommodates your configuration.
|
3. Uncomment the line and adjust the value; start with 64, but if you encounter issues, incrementally increase it *(e.g., 128, 256)* until it accommodates your configuration.
|
||||||
|
4. Open *(or duplicate)* your 4get configuration file and edit it:
|
||||||
|
* Example configuration, again:
|
||||||
|
```sh
|
||||||
|
server {
|
||||||
|
access_log /dev/null; # Search log file. Do you really need to?
|
||||||
|
error_log /dev/null; # Error log file.
|
||||||
|
|
||||||
Open your current 4get NGINX config (that is under `/etc/nginx/sites-available/`) and append this to the end of the file:
|
# Change this if you have 4get in another folder.
|
||||||
|
|
||||||
```
|
|
||||||
server {
|
|
||||||
access_log /dev/null;
|
|
||||||
error_log /dev/null;
|
|
||||||
|
|
||||||
listen 80;
|
|
||||||
server_name <youronionaddress>;
|
|
||||||
root /var/www/4get;
|
root /var/www/4get;
|
||||||
|
# Change 'onionaddress.onion' to your onion address.
|
||||||
|
server_name onionaddress.onion;
|
||||||
|
# Port to listen to.
|
||||||
|
listen 80;
|
||||||
|
|
||||||
location @php {
|
location @php {
|
||||||
try_files $uri.php $uri/index.php =404;
|
try_files $uri.php $uri/index.php =404;
|
||||||
# Change the unix socket address if it's different for you.
|
# Change the unix socket address if it's different for you.
|
||||||
fastcgi_pass unix:/var/run/php-fpm/php-fpm.sock;
|
fastcgi_pass unix:/var/run/php-fpm/php-fpm.sock;
|
||||||
fastcgi_index index.php;
|
fastcgi_index index.php;
|
||||||
# Change this to `fastcgi_params` if you use a debian based distro.
|
# Change this to `fastcgi_params` if you use a debian based distribution.
|
||||||
include fastcgi.conf;
|
include fastcgi.conf;
|
||||||
fastcgi_intercept_errors on;
|
fastcgi_intercept_errors on;
|
||||||
}
|
}
|
||||||
|
@ -95,9 +175,20 @@ server {
|
||||||
location ~* ^(.*)\.php$ {
|
location ~* ^(.*)\.php$ {
|
||||||
return 301 $1;
|
return 301 $1;
|
||||||
}
|
}
|
||||||
}
|
|
||||||
```
|
|
||||||
|
|
||||||
Obviously replace `<youronionaddress>` by the onion address of `/var/lib/tor/4get/hostname` and then check if the nginx config is valid with `nginx -t` if yes, then restart the nginx service and try opening the onion address into the Tor Browser. You can see a real world example [here](https://git.zzls.xyz/Fijxu/etc-configs/src/branch/selfhost/nginx/sites-available/4get.zzls.xyz.conf)
|
}
|
||||||
|
```
|
||||||
|
A real world example is present in [^2].
|
||||||
|
5. Once done, check the configuration with `nginx -t`. If everything's fine and dandy, refer to <a href="https://git.lolcat.ca/lolcat/4get/src/branch/master/docs/tor.md">the Tor guide</a> to set up your onion site.
|
||||||
|
|
||||||
Once you did the above, refer to <a href="https://git.lolcat.ca/lolcat/4get/src/branch/master/docs/tor.md">this tor guide</a> to setup your onionsite.
|
<h2 align=center>Other important things</h2>
|
||||||
|
|
||||||
|
1. <a href="https://git.lolcat.ca/lolcat/4get/src/branch/master/docs/configure.md">Configuration guide</a>: Things to do after setup.
|
||||||
|
2. <a href="https://git.lolcat.ca/lolcat/4get/src/branch/master/docs/apache2.md">Apache2 guide</a>: Fallback to this if you couldn't get something to work, or you don't know something.
|
||||||
|
|
||||||
|
<h2 align=center>Known issues</h2>
|
||||||
|
|
||||||
|
1. https://git.lolcat.ca/lolcat/4get/issues
|
||||||
|
|
||||||
|
[^1]: lolcat/4get#40, If having issues with `libsodium`, or `php-apcu`.
|
||||||
|
[^2]: <a href="https://git.nadeko.net/Fijxu/etc-configs/src/branch/selfhost/nginx/conf.d/4get.conf">git.nadeko.net</a> nadeko.net's 4get instance configuration.
|
|
@ -75,6 +75,7 @@ class backend{
|
||||||
break;
|
break;
|
||||||
|
|
||||||
case "socks5_hostname":
|
case "socks5_hostname":
|
||||||
|
case "socks5h":
|
||||||
case "socks5a":
|
case "socks5a":
|
||||||
curl_setopt($curlproc, CURLOPT_PROXYTYPE, CURLPROXY_SOCKS5_HOSTNAME);
|
curl_setopt($curlproc, CURLOPT_PROXYTYPE, CURLPROXY_SOCKS5_HOSTNAME);
|
||||||
curl_setopt($curlproc, CURLOPT_PROXY, $address . ":" . $port);
|
curl_setopt($curlproc, CURLOPT_PROXY, $address . ":" . $port);
|
||||||
|
|
|
@ -838,10 +838,10 @@ class frontend{
|
||||||
}
|
}
|
||||||
|
|
||||||
$payload .=
|
$payload .=
|
||||||
'<a href="https://webcache.googleusercontent.com/search?q=cache:' . $urlencode . '" class="list" target="_BLANK"><img src="/favicon?s=https://google.com" alt="go">Google cache</a>' .
|
|
||||||
'<a href="https://web.archive.org/web/' . $urlencode . '" class="list" target="_BLANK"><img src="/favicon?s=https://archive.org" alt="ar">Archive.org</a>' .
|
'<a href="https://web.archive.org/web/' . $urlencode . '" class="list" target="_BLANK"><img src="/favicon?s=https://archive.org" alt="ar">Archive.org</a>' .
|
||||||
'<a href="https://archive.ph/newest/' . htmlspecialchars($link) . '" class="list" target="_BLANK"><img src="/favicon?s=https://archive.is" alt="ar">Archive.is</a>' .
|
'<a href="https://archive.ph/newest/' . htmlspecialchars($link) . '" class="list" target="_BLANK"><img src="/favicon?s=https://archive.is" alt="ar">Archive.is</a>' .
|
||||||
'<a href="https://ghostarchive.org/search?term=' . $urlencode . '" class="list" target="_BLANK"><img src="/favicon?s=https://ghostarchive.org" alt="gh">Ghostarchive</a>' .
|
'<a href="https://ghostarchive.org/search?term=' . $urlencode . '" class="list" target="_BLANK"><img src="/favicon?s=https://ghostarchive.org" alt="gh">Ghostarchive</a>' .
|
||||||
|
'<a href="https://arquivo.pt/wayback/' . htmlspecialchars($link) . '" class="list" target="_BLANK"><img src="/favicon?s=https://arquivo.pt" alt="ar">Arquivo.pt</a>' .
|
||||||
'<a href="https://www.bing.com/search?q=url%3A' . $urlencode . '" class="list" target="_BLANK"><img src="/favicon?s=https://bing.com" alt="bi">Bing cache</a>' .
|
'<a href="https://www.bing.com/search?q=url%3A' . $urlencode . '" class="list" target="_BLANK"><img src="/favicon?s=https://bing.com" alt="bi">Bing cache</a>' .
|
||||||
'<a href="https://megalodon.jp/?url=' . $urlencode . '" class="list" target="_BLANK"><img src="/favicon?s=https://megalodon.jp" alt="me">Megalodon</a>' .
|
'<a href="https://megalodon.jp/?url=' . $urlencode . '" class="list" target="_BLANK"><img src="/favicon?s=https://megalodon.jp" alt="me">Megalodon</a>' .
|
||||||
'</div>';
|
'</div>';
|
||||||
|
@ -939,6 +939,7 @@ class frontend{
|
||||||
"brave" => "Brave",
|
"brave" => "Brave",
|
||||||
"yandex" => "Yandex",
|
"yandex" => "Yandex",
|
||||||
"google" => "Google",
|
"google" => "Google",
|
||||||
|
"google_cse" => "Google CSE",
|
||||||
"startpage" => "Startpage",
|
"startpage" => "Startpage",
|
||||||
"qwant" => "Qwant",
|
"qwant" => "Qwant",
|
||||||
"ghostery" => "Ghostery",
|
"ghostery" => "Ghostery",
|
||||||
|
@ -963,6 +964,7 @@ class frontend{
|
||||||
"yandex" => "Yandex",
|
"yandex" => "Yandex",
|
||||||
"brave" => "Brave",
|
"brave" => "Brave",
|
||||||
"google" => "Google",
|
"google" => "Google",
|
||||||
|
"google_cse" => "Google CSE",
|
||||||
"startpage" => "Startpage",
|
"startpage" => "Startpage",
|
||||||
"qwant" => "Qwant",
|
"qwant" => "Qwant",
|
||||||
"yep" => "Yep",
|
"yep" => "Yep",
|
||||||
|
|
109
lib/fuckhtml.php
109
lib/fuckhtml.php
|
@ -381,6 +381,8 @@ class fuckhtml{
|
||||||
$json_out = null;
|
$json_out = null;
|
||||||
$last_char = null;
|
$last_char = null;
|
||||||
|
|
||||||
|
$keyword_check = null;
|
||||||
|
|
||||||
for($i=0; $i<strlen($json); $i++){
|
for($i=0; $i<strlen($json); $i++){
|
||||||
|
|
||||||
switch($json[$i]){
|
switch($json[$i]){
|
||||||
|
@ -396,6 +398,7 @@ class fuckhtml{
|
||||||
|
|
||||||
$bracket = false;
|
$bracket = false;
|
||||||
$is_close_bracket = true;
|
$is_close_bracket = true;
|
||||||
|
|
||||||
}else{
|
}else{
|
||||||
|
|
||||||
if($bracket === false){
|
if($bracket === false){
|
||||||
|
@ -429,6 +432,31 @@ class fuckhtml{
|
||||||
$is_close_bracket === false
|
$is_close_bracket === false
|
||||||
){
|
){
|
||||||
|
|
||||||
|
// do keyword check
|
||||||
|
$keyword_check .= $json[$i];
|
||||||
|
|
||||||
|
if(in_array($json[$i], [":", "{"])){
|
||||||
|
|
||||||
|
$keyword_check = substr($keyword_check, 0, -1);
|
||||||
|
|
||||||
|
if(
|
||||||
|
preg_match(
|
||||||
|
'/function|array|return/i',
|
||||||
|
$keyword_check
|
||||||
|
)
|
||||||
|
){
|
||||||
|
|
||||||
|
$json_out =
|
||||||
|
preg_replace(
|
||||||
|
'/[{"]*' . preg_quote($keyword_check, "/") . '$/',
|
||||||
|
"",
|
||||||
|
$json_out
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
$keyword_check = null;
|
||||||
|
}
|
||||||
|
|
||||||
// here we know we're not iterating over a quoted string
|
// here we know we're not iterating over a quoted string
|
||||||
switch($json[$i]){
|
switch($json[$i]){
|
||||||
|
|
||||||
|
@ -498,4 +526,85 @@ class fuckhtml{
|
||||||
$string
|
$string
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public function extract_json($json){
|
||||||
|
|
||||||
|
$len = strlen($json);
|
||||||
|
$array_level = 0;
|
||||||
|
$object_level = 0;
|
||||||
|
$in_quote = null;
|
||||||
|
$start = null;
|
||||||
|
|
||||||
|
for($i=0; $i<$len; $i++){
|
||||||
|
|
||||||
|
switch($json[$i]){
|
||||||
|
|
||||||
|
case "[":
|
||||||
|
if($in_quote === null){
|
||||||
|
|
||||||
|
$array_level++;
|
||||||
|
if($start === null){
|
||||||
|
|
||||||
|
$start = $i;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
|
||||||
|
case "]":
|
||||||
|
if($in_quote === null){
|
||||||
|
|
||||||
|
$array_level--;
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
|
||||||
|
case "{":
|
||||||
|
if($in_quote === null){
|
||||||
|
|
||||||
|
$object_level++;
|
||||||
|
if($start === null){
|
||||||
|
|
||||||
|
$start = $i;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
|
||||||
|
case "}":
|
||||||
|
if($in_quote === null){
|
||||||
|
|
||||||
|
$object_level--;
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
|
||||||
|
case "\"":
|
||||||
|
case "'":
|
||||||
|
if(
|
||||||
|
$i !== 0 &&
|
||||||
|
$json[$i - 1] !== "\\"
|
||||||
|
){
|
||||||
|
// found a non-escaped quote
|
||||||
|
|
||||||
|
if($in_quote === null){
|
||||||
|
|
||||||
|
// open quote
|
||||||
|
$in_quote = $json[$i];
|
||||||
|
}elseif($in_quote === $json[$i]){
|
||||||
|
|
||||||
|
// close quote
|
||||||
|
$in_quote = null;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
if(
|
||||||
|
$start !== null &&
|
||||||
|
$array_level === 0 &&
|
||||||
|
$object_level === 0
|
||||||
|
){
|
||||||
|
|
||||||
|
return substr($json, $start, $i - $start + 1);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -293,8 +293,8 @@ class brave{
|
||||||
/*
|
/*
|
||||||
$handle = fopen("scraper/brave.html", "r");
|
$handle = fopen("scraper/brave.html", "r");
|
||||||
$html = fread($handle, filesize("scraper/brave.html"));
|
$html = fread($handle, filesize("scraper/brave.html"));
|
||||||
fclose($handle);
|
fclose($handle);*/
|
||||||
*/
|
|
||||||
|
|
||||||
try{
|
try{
|
||||||
$html =
|
$html =
|
||||||
|
@ -410,10 +410,20 @@ class brave{
|
||||||
throw new Exception("Could not grep JavaScript object");
|
throw new Exception("Could not grep JavaScript object");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
$data =
|
||||||
|
rtrim(
|
||||||
|
preg_replace(
|
||||||
|
'/\(Array\(0\)\)\).*$/',
|
||||||
|
"",
|
||||||
|
$grep[1]
|
||||||
|
),
|
||||||
|
" ]"
|
||||||
|
) . "]";
|
||||||
|
|
||||||
$data =
|
$data =
|
||||||
$this->fuckhtml
|
$this->fuckhtml
|
||||||
->parseJsObject(
|
->parseJsObject(
|
||||||
$grep[1]
|
$data
|
||||||
);
|
);
|
||||||
unset($grep);
|
unset($grep);
|
||||||
|
|
||||||
|
@ -663,7 +673,10 @@ class brave{
|
||||||
$table["Address"] = $result["location"]["postal_address"]["displayAddress"];
|
$table["Address"] = $result["location"]["postal_address"]["displayAddress"];
|
||||||
}
|
}
|
||||||
|
|
||||||
if(isset($result["location"]["rating"])){
|
if(
|
||||||
|
isset($result["location"]["rating"]) &&
|
||||||
|
$result["location"]["rating"] != "void 0"
|
||||||
|
){
|
||||||
|
|
||||||
$table["Rating"] =
|
$table["Rating"] =
|
||||||
$result["location"]["rating"]["ratingValue"] . "/" .
|
$result["location"]["rating"]["ratingValue"] . "/" .
|
||||||
|
@ -671,13 +684,19 @@ class brave{
|
||||||
number_format($result["location"]["rating"]["reviewCount"]) . " votes)";
|
number_format($result["location"]["rating"]["reviewCount"]) . " votes)";
|
||||||
}
|
}
|
||||||
|
|
||||||
if(isset($result["location"]["contact"]["telephone"])){
|
if(
|
||||||
|
isset($result["location"]["contact"]["telephone"]) &&
|
||||||
|
$result["location"]["contact"]["telephone"] != "void 0"
|
||||||
|
){
|
||||||
|
|
||||||
$table["Phone number"] =
|
$table["Phone number"] =
|
||||||
$result["location"]["contact"]["telephone"];
|
$result["location"]["contact"]["telephone"];
|
||||||
}
|
}
|
||||||
|
|
||||||
if(isset($result["location"]["price_range"])){
|
if(
|
||||||
|
isset($result["location"]["price_range"]) &&
|
||||||
|
$result["location"]["price_range"] != "void 0"
|
||||||
|
){
|
||||||
|
|
||||||
$table["Price"] =
|
$table["Price"] =
|
||||||
$result["location"]["price_range"];
|
$result["location"]["price_range"];
|
||||||
|
|
3003
scraper/ddg.php
3003
scraper/ddg.php
File diff suppressed because it is too large
Load Diff
|
@ -136,7 +136,7 @@ class ftm{
|
||||||
"source" => [
|
"source" => [
|
||||||
[
|
[
|
||||||
"url" =>
|
"url" =>
|
||||||
"https://findthatmeme.us-southeast-1.linodeobjects.com/" .
|
"https://s3.thehackerblog.com/findthatmeme/" .
|
||||||
$thumb,
|
$thumb,
|
||||||
"width" => null,
|
"width" => null,
|
||||||
"height" => null
|
"height" => null
|
||||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -220,6 +220,7 @@ class marginalia{
|
||||||
"related" => []
|
"related" => []
|
||||||
];
|
];
|
||||||
|
|
||||||
|
// API scraper
|
||||||
if(config::MARGINALIA_API_KEY !== null){
|
if(config::MARGINALIA_API_KEY !== null){
|
||||||
|
|
||||||
try{
|
try{
|
||||||
|
@ -263,7 +264,29 @@ class marginalia{
|
||||||
return $out;
|
return $out;
|
||||||
}
|
}
|
||||||
|
|
||||||
// no more cloudflare!! Parse html by default
|
// HTML parser
|
||||||
|
$proxy = $this->backend->get_ip();
|
||||||
|
|
||||||
|
if($get["npt"]){
|
||||||
|
|
||||||
|
[$params, $proxy] =
|
||||||
|
$this->backend->get(
|
||||||
|
$get["npt"],
|
||||||
|
"web"
|
||||||
|
);
|
||||||
|
|
||||||
|
try{
|
||||||
|
$html =
|
||||||
|
$this->get(
|
||||||
|
$proxy,
|
||||||
|
"https://search.marginalia.nu/search?" . $params
|
||||||
|
);
|
||||||
|
}catch(Exception $error){
|
||||||
|
|
||||||
|
throw new Exception("Failed to get HTML");
|
||||||
|
}
|
||||||
|
|
||||||
|
}else{
|
||||||
$params = [
|
$params = [
|
||||||
"query" => $search
|
"query" => $search
|
||||||
];
|
];
|
||||||
|
@ -284,7 +307,7 @@ class marginalia{
|
||||||
try{
|
try{
|
||||||
$html =
|
$html =
|
||||||
$this->get(
|
$this->get(
|
||||||
$this->backend->get_ip(),
|
$proxy,
|
||||||
"https://search.marginalia.nu/search",
|
"https://search.marginalia.nu/search",
|
||||||
$params
|
$params
|
||||||
);
|
);
|
||||||
|
@ -292,6 +315,7 @@ class marginalia{
|
||||||
|
|
||||||
throw new Exception("Failed to get HTML");
|
throw new Exception("Failed to get HTML");
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
$this->fuckhtml->load($html);
|
$this->fuckhtml->load($html);
|
||||||
|
|
||||||
|
@ -387,6 +411,65 @@ class marginalia{
|
||||||
];
|
];
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// get next page
|
||||||
|
$this->fuckhtml->load($html);
|
||||||
|
|
||||||
|
$pagination =
|
||||||
|
$this->fuckhtml
|
||||||
|
->getElementsByAttributeValue(
|
||||||
|
"aria-label",
|
||||||
|
"pagination",
|
||||||
|
"nav"
|
||||||
|
);
|
||||||
|
|
||||||
|
if(count($pagination) === 0){
|
||||||
|
|
||||||
|
// no pagination
|
||||||
|
return $out;
|
||||||
|
}
|
||||||
|
|
||||||
|
$this->fuckhtml->load($pagination[0]);
|
||||||
|
|
||||||
|
$pages =
|
||||||
|
$this->fuckhtml
|
||||||
|
->getElementsByClassName(
|
||||||
|
"page-link",
|
||||||
|
"a"
|
||||||
|
);
|
||||||
|
|
||||||
|
$found_current_page = false;
|
||||||
|
|
||||||
|
foreach($pages as $page){
|
||||||
|
|
||||||
|
if(
|
||||||
|
stripos(
|
||||||
|
$page["attributes"]["class"],
|
||||||
|
"active"
|
||||||
|
) !== false
|
||||||
|
){
|
||||||
|
|
||||||
|
$found_current_page = true;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
if($found_current_page){
|
||||||
|
|
||||||
|
// we found current page index, and we iterated over
|
||||||
|
// the next page <a>
|
||||||
|
|
||||||
|
$out["npt"] =
|
||||||
|
$this->backend->store(
|
||||||
|
parse_url(
|
||||||
|
$page["attributes"]["href"],
|
||||||
|
PHP_URL_QUERY
|
||||||
|
),
|
||||||
|
"web",
|
||||||
|
$proxy
|
||||||
|
);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
return $out;
|
return $out;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -701,9 +701,11 @@ class mojeek{
|
||||||
if(count($thumb) === 2){
|
if(count($thumb) === 2){
|
||||||
|
|
||||||
$answer["thumb"] =
|
$answer["thumb"] =
|
||||||
|
urldecode(
|
||||||
$this->fuckhtml
|
$this->fuckhtml
|
||||||
->getTextContent(
|
->getTextContent(
|
||||||
$thumb[1]
|
$thumb[1]
|
||||||
|
)
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -133,6 +133,10 @@ $settings = [
|
||||||
"value" => "google",
|
"value" => "google",
|
||||||
"text" => "Google"
|
"text" => "Google"
|
||||||
],
|
],
|
||||||
|
[
|
||||||
|
"value" => "google_cse",
|
||||||
|
"text" => "Google CSE"
|
||||||
|
],
|
||||||
[
|
[
|
||||||
"value" => "startpage",
|
"value" => "startpage",
|
||||||
"text" => "Startpage"
|
"text" => "Startpage"
|
||||||
|
@ -203,6 +207,10 @@ $settings = [
|
||||||
"value" => "google",
|
"value" => "google",
|
||||||
"text" => "Google"
|
"text" => "Google"
|
||||||
],
|
],
|
||||||
|
[
|
||||||
|
"value" => "google_cse",
|
||||||
|
"text" => "Google CSE"
|
||||||
|
],
|
||||||
[
|
[
|
||||||
"value" => "startpage",
|
"value" => "startpage",
|
||||||
"text" => "Startpage"
|
"text" => "Startpage"
|
||||||
|
|
|
@ -16,6 +16,7 @@
|
||||||
|
|
||||||
body{
|
body{
|
||||||
padding:15px 4% 40px;
|
padding:15px 4% 40px;
|
||||||
|
margin:unset;
|
||||||
}
|
}
|
||||||
|
|
||||||
h1,h2,h3,h4,h5,h6{
|
h1,h2,h3,h4,h5,h6{
|
||||||
|
|
|
@ -0,0 +1,40 @@
|
||||||
|
:root
|
||||||
|
{
|
||||||
|
--accent : #f79e98;
|
||||||
|
--1d2021 : #180d0c;
|
||||||
|
--282828 : #180d0c;
|
||||||
|
--3c3836 : #251615;
|
||||||
|
--504945 : #251615;
|
||||||
|
--928374 : var(--accent);
|
||||||
|
--a89984 : #d8c5c4;
|
||||||
|
--bdae93 : #d8c5c4;
|
||||||
|
--8ec07c : var(--accent);
|
||||||
|
--ebdbb2 : #d8c5c4;
|
||||||
|
--comment: #928374;
|
||||||
|
--default: #DCC9BC;
|
||||||
|
--keyword: #F07342;
|
||||||
|
--string : var(--accent);
|
||||||
|
--green : #959A6B;
|
||||||
|
--yellow : #E39C45;
|
||||||
|
--red : #CF223E;
|
||||||
|
--white : var(--a89984);
|
||||||
|
--black : var(--1d2021);
|
||||||
|
--hover : #b18884
|
||||||
|
}
|
||||||
|
|
||||||
|
a.link, a { color: var(--accent); text-decoration: none; }
|
||||||
|
.searchbox { width: 23%; }
|
||||||
|
.filters filter select { color: #E39C45; }
|
||||||
|
.web .separator::before { color: var(--white) }
|
||||||
|
.searchbox input[type="text"]::placeholder { color: var(--white); }
|
||||||
|
a.link:hover
|
||||||
|
{
|
||||||
|
color: var(--hover);
|
||||||
|
text-shadow: 0 0 .2rem var(--hover);
|
||||||
|
}
|
||||||
|
.code-inline
|
||||||
|
{ border-color: var(--default); font-family: monospace;}
|
||||||
|
.home #center a
|
||||||
|
{ color: var(--accent); }
|
||||||
|
.home .subtext
|
||||||
|
{ color: var(--white); }
|
Loading…
Reference in New Issue