This commit is contained in:
lolcat 2024-04-21 19:31:56 -04:00
parent 9e18327df6
commit 130358a9e0
16 changed files with 1385 additions and 457 deletions

1
.gitignore vendored
View File

@ -29,3 +29,4 @@ data/captcha/minecraft/
banner/*
!banner/*default*
# (stray merge-conflict marker ">>>>>>> 77293818cd213ec0ad07c573d298fff9cd5b357d" removed)
scraper/curlie.html

View File

@ -11,63 +11,42 @@ https://4get.ca
## Totally unbiased comparison between alternatives
| | 4get | searx(ng) | librex | araa |
|----------------------------|-------------------------|-----------|-------------|----------|
| RAM usage | 200-400mb~ | 2GB~ | 200-400mb~ | 2GB~ |
| Does it suck | no (debunked by snopes) | yes | yes | a little |
| Does it work | ye | no | no | ye |
| Did the dev commit suicide | not until my 30s | idk | yes | no |
| | 4get | searx(ng) | librex | araa |
|----------------------------|-------------------------|-----------|-------------|-----------|
| RAM usage | 200-400mb~ | 2GB~ | 200-400mb~ | 2GB~ |
| Does it suck | no (debunked by snopes) | yes | yes | a little |
| Does it work | ye | sometimes | no | sometimes |
| Did the dev commit suicide | not until my 30s | no | allegedly | no |
## Features
1. Rotating proxies on a per-scraper basis
2. Search filters, which SearxNG lacks for the most part
3. Bot protection that *actually* filters out the bots (when configured)
4. Interface doesn't require javascript
5. Favicon fetcher with caching support & image proxy
6. Bunch of other shit
tl;dr the best way to actually browse for shit.
# Supported websites
1. Web
- DuckDuckGo
- Brave
- Yandex
- Google
- Mwmbl
- Mojeek
- Marginalia
- wiby
- Curlie
2. Images
- DuckDuckGo
- Yandex
- Google
- Brave
- Yep
- Imgur
- FindThatMeme
3. Videos
- YouTube
- DuckDuckgo
- Brave
- Yandex
- Google
4. News
- DuckDuckGo
- Brave
- Google
- Mojeek
5. Music
- SoundCloud
6. Autocompleter
- Brave
- DuckDuckGo
- Yandex
- Google
- Qwant
- Yep
- Marginalia
- YouTube
- SoundCloud
| Web | Images | Videos | News | Music | Autocompleter |
|------------|--------------|------------|------------|------------|---------------|
| DuckDuckGo | DuckDuckGo | YouTube | DuckDuckGo | SoundCloud | Brave |
| Brave | Brave | DuckDuckGo | Brave | | DuckDuckGo |
| Yandex | Yandex | Brave | Google | | Yandex |
| Google | Google | Yandex | Qwant | | Google |
| Qwant | Qwant | Google | Mojeek | | Yep |
| Yep | Pinterest | Qwant | | | Marginalia |
| Crowdview | Yep | | | | YouTube |
| Mwmbl | Imgur | | | | SoundCloud |
| Mojeek | FindThatMeme | | | | |
| Marginalia | | | | | |
| wiby | | | | | |
| Curlie | | | | | |
# Installation
Refer to the <a href="https://git.lolcat.ca/lolcat/4get/src/branch/master/docs/">documentation index</a>!
Refer to the <a href="https://git.lolcat.ca/lolcat/4get/src/branch/master/docs/">documentation index</a>. I recommend following the <a href="https://git.lolcat.ca/lolcat/4get/src/branch/master/docs/apache2.md">apache2 guide</a>.
## Contact
Shit breaks all the time but I repair it all the time too! Email me here: will (at) lolcat.ca
Shit breaks all the time but I repair it all the time too... Email me here: <b>will (at) lolcat.ca</b> or create an issue.

View File

@ -18,7 +18,7 @@ class autocomplete{
"yep" => "https://api.yep.com/ac/?query={searchTerms}",
"marginalia" => "https://search.marginalia.nu/suggest/?partial={searchTerms}",
"yt" => "https://suggestqueries-clients6.youtube.com/complete/search?client=youtube&q={searchTerms}",
"sc" => "https://api-v2.soundcloud.com/search/queries?q={searchTerms}&client_id=" . config::SC_CLIENT_TOKEN . "&limit=10&offset=0&linked_partitioning=1&app_version=1693487844&app_locale=en"
"sc" => ""
];
/*
@ -39,14 +39,6 @@ class autocomplete{
$this->do404("Search(s) exceeds the 500 char length");
}
if(
isset($_GET["scraper"]) &&
is_string($_GET["scraper"]) === false
){
$_GET["scraper"] = "brave"; // default option
}
/*
Get $scraper
*/
@ -77,7 +69,6 @@ class autocomplete{
}
// return results
switch($scraper){
case "google":
@ -115,7 +106,16 @@ class autocomplete{
case "sc":
// soundcloud
$js = $this->get($this->scrapers[$scraper], $_GET["s"]);
chdir("../../");
include "scraper/sc.php";
$sc = new sc();
$token = $sc->get_token("raw_ip::::");
$js = $this->get(
"https://api-v2.soundcloud.com/search/queries?q={searchTerms}&client_id=" . $token . "&limit=10&offset=0&linked_partitioning=1&app_version=1693487844&app_locale=en",
$_GET["s"]
);
$js = json_decode($js, true);

View File

@ -5,7 +5,7 @@ class config{
// any parameters.
// 4get version. Please keep this updated
const VERSION = 7;
const VERSION = 8;
// Will be shown pretty much everywhere.
const SERVER_NAME = "4get";
@ -63,13 +63,6 @@ class config{
"via"
];
// @TODO: Portscan the user for open proxies before allowing a connection, block user if any are found
// Requires the nmap package
const NMAP_PROXY_CHECK = false;
// @TODO: Make IP blacklist public under /api/v1/blacklist endpoint ?
const PUBLIC_IP_BLACKLIST = true;
// Maximal number of searches per captcha key/pass issued. Counter gets
// reset on every APCU cache clear (should happen once a day).
// Only useful when BOT_PROTECTION is NOT set to 0
@ -113,7 +106,7 @@ class config{
// Default user agent to use for scraper requests. Sometimes ignored to get specific webpages
// Changing this might break things.
const USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:124.0) Gecko/20100101 Firefox/124.0";
const USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:125.0) Gecko/20100101 Firefox/125.0";
// Proxy pool assignments for each scraper
// false = Use server's raw IP
@ -123,6 +116,7 @@ class config{
const PROXY_BRAVE = false;
const PROXY_FB = false; // facebook
const PROXY_GOOGLE = false;
const PROXY_QWANT = false;
const PROXY_MARGINALIA = false;
const PROXY_MOJEEK = false;
const PROXY_SC = false; // soundcloud
@ -146,14 +140,8 @@ class config{
// Scraper-specific parameters
//
// SOUNDCLOUD
// Get these parameters by making a search on soundcloud with network
// tab open, then filter URLs using "search?q=". (No need to login)
const SC_USER_ID = "447501-577662-794348-352629";
const SC_CLIENT_TOKEN = "VNc62l3wxDWS0Ol62j5UYNc1gsZ3UXPv";
// MARGINALIA
// Get an API key by contacting the Marginalia.nu maintainer. The "public" key
// works but is almost always rate-limited.
const MARGINALIA_API_KEY = "public";
// Use "null" to default out to HTML scraping OR specify a string to
// use the API (Eg: "public"). API has less filters.
const MARGINALIA_API_KEY = null;
}

View File

@ -93,31 +93,31 @@ class backend{
*/
public function store($payload, $page, $proxy){
$page = $page[0];
$password = random_bytes(256); // 2048 bit
$salt = random_bytes(16);
$key = hash_pbkdf2("sha512", $password, $salt, 20000, 32, true);
$iv =
random_bytes(
openssl_cipher_iv_length("aes-256-gcm")
);
$tag = "";
$out = openssl_encrypt($payload, "aes-256-gcm", $key, OPENSSL_RAW_DATA, $iv, $tag, "", 16);
$key = sodium_crypto_secretbox_keygen();
$nonce = random_bytes(SODIUM_CRYPTO_SECRETBOX_NONCEBYTES);
$requestid = apcu_inc("requestid");
apcu_store(
$page . "." .
$this->scraper .
$page[0] . "." . // first letter of page name
$this->scraper . // scraper name
$requestid,
gzdeflate($proxy . "," . $salt.$iv.$out.$tag),
900 // cache information for 15 minutes blaze it
[
$nonce,
$proxy,
// compress and encrypt
sodium_crypto_secretbox(
gzdeflate($payload),
$nonce,
$key
)
],
900 // cache information for 15 minutes
);
return
$this->scraper . $requestid . "." .
rtrim(strtr(base64_encode($password), '+/', '-_'), '=');
rtrim(strtr(base64_encode($key), '+/', '-_'), '=');
}
public function get($npt, $page){
@ -137,7 +137,7 @@ class backend{
if($payload === false){
throw new Exception("The nextPageToken is invalid or has expired!");
throw new Exception("The next page token is invalid or has expired!");
}
$key =
@ -150,47 +150,27 @@ class backend{
)
);
$payload = gzinflate($payload);
// get proxy
[
$proxy,
$payload
] = explode(",", $payload, 2);
$key =
hash_pbkdf2(
"sha512",
$key,
substr($payload, 0, 16), // salt
20000,
32,
true
);
$ivlen = openssl_cipher_iv_length("aes-256-gcm");
$payload =
openssl_decrypt(
substr(
$payload,
16 + $ivlen,
-16
),
"aes-256-gcm",
$key,
OPENSSL_RAW_DATA,
substr($payload, 16, $ivlen),
substr($payload, -16)
// decrypt and decompress data
$payload[2] =
gzinflate(
sodium_crypto_secretbox_open(
$payload[2], // data
$payload[0], // nonce
$key
)
);
if($payload === false){
if($payload[2] === false){
throw new Exception("The nextPageToken is invalid or has expired!");
throw new Exception("The next page token is invalid or has expired!");
}
// remove the key after using
// remove the key after using successfully
apcu_delete($apcu);
return [$payload, $proxy];
return [
$payload[2], // data
$payload[1] // proxy
];
}
}

View File

@ -290,30 +290,24 @@ class proxy{
if(isset($headers["content-type"])){
if($headers["content-type"] == "text/html"){
if(stripos($headers["content-type"], "text/html") !== false){
throw new Exception("Server returned an html document instead of image");
throw new Exception("Server returned html");
}
$tmp = explode(";", $headers["content-type"]);
for($i=0; $i<count($tmp); $i++){
if(
preg_match(
'/image\/([^ ]+)/i',
$headers["content-type"],
$match
)
){
if(
preg_match(
'/^image\/([^ ]+)/i',
$tmp[$i],
$match
)
){
$format = strtolower($match[1]);
if(substr(strtolower($format), 0, 2) == "x-"){
$format = strtolower($match[1]);
if(substr($format, 0, 2) == "x-"){
$format = substr($format, 2);
}
break;
$format = substr($format, 2);
}
}
}
@ -351,6 +345,8 @@ class proxy{
private function stream($url, $referer, $format){
$this->clientcache();
$this->url = $url;
$this->format = $format;
@ -360,8 +356,6 @@ class proxy{
throw new Exception("Invalid URL");
}
$this->clientcache();
$curl = curl_init();
// set headers
@ -490,11 +484,14 @@ class proxy{
// get content type
if(isset($this->headers["content-type"])){
$filetype = explode("/", $this->headers["content-type"]);
$octet_check = stripos($this->headers["content-type"], "octet-stream");
if(strtolower($filetype[0]) != $this->format){
if(
stripos($this->headers["content-type"], $this->format) === false &&
$octet_check === false
){
throw new Exception("Resource is not an {$this->format} (Found {$filetype[0]} instead)");
throw new Exception("Resource reported invalid Content-Type");
}
}else{
@ -502,6 +499,18 @@ class proxy{
throw new Exception("Resource is not an {$this->format} (no Content-Type)");
}
$filetype = explode("/", $this->headers["content-type"]);
if(!isset($filetype[1])){
throw new Exception("Malformed Content-Type header");
}
if($octet_check !== false){
$filetype[1] = "jpeg";
}
header("Content-Type: {$this->format}/{$filetype[1]}");
// give payload size
@ -541,7 +550,7 @@ class proxy{
if(isset($filename[1])){
header("Content-Disposition: filename=" . $filename[1] . "." . $filetype);
header("Content-Disposition: filename=\"" . trim($filename[1], "\"'") . "." . $filetype . "\"");
return;
}
}
@ -552,7 +561,7 @@ class proxy{
if($filename === null){
// everything failed! rename file to domain name
header("Content-Disposition: filename=" . parse_url($url, PHP_URL_HOST) . "." . $filetype);
header("Content-Disposition: filename=\"" . parse_url($url, PHP_URL_HOST) . "." . $filetype . "\"");
return;
}
@ -569,7 +578,7 @@ class proxy{
$filename = implode(".", $filename);
header("Content-Disposition: inline; filename=" . $filename . "." . $filetype);
header("Content-Disposition: inline; filename=\"" . $filename . "." . $filetype . "\"");
return;
}

View File

@ -923,6 +923,7 @@ class frontend{
"brave" => "Brave",
"yandex" => "Yandex",
"google" => "Google",
"qwant" => "Qwant",
"yep" => "Yep",
"crowdview" => "Crowdview",
"mwmbl" => "Mwmbl",
@ -942,6 +943,7 @@ class frontend{
"yandex" => "Yandex",
"brave" => "Brave",
"google" => "Google",
"qwant" => "Qwant",
"yep" => "Yep",
//"pinterest" => "Pinterest",
"imgur" => "Imgur",
@ -959,7 +961,8 @@ class frontend{
"ddg" => "DuckDuckGo",
"brave" => "Brave",
"yandex" => "Yandex",
"google" => "Google"
"google" => "Google",
"qwant" => "Qwant"
]
];
break;
@ -971,6 +974,7 @@ class frontend{
"ddg" => "DuckDuckGo",
"brave" => "Brave",
"google" => "Google",
"qwant" => "Qwant",
"yep" => "Yep",
"mojeek" => "Mojeek"
]
@ -1010,98 +1014,8 @@ class frontend{
$scraper_out = $first;
}
switch($scraper_out){
case "ddg":
include "scraper/ddg.php";
$lib = new ddg();
break;
case "brave":
include "scraper/brave.php";
$lib = new brave();
break;
case "yt";
include "scraper/youtube.php";
$lib = new youtube();
break;
case "yandex":
include "scraper/yandex.php";
$lib = new yandex();
break;
case "google":
include "scraper/google.php";
$lib = new google();
break;
/*
case "fb":
include "scraper/facebook.php";
$lib = new facebook();
break;*/
case "crowdview":
include "scraper/crowdview.php";
$lib = new crowdview();
break;
case "mwmbl":
include "scraper/mwmbl.php";
$lib = new mwmbl();
break;
case "mojeek":
include "scraper/mojeek.php";
$lib = new mojeek();
break;
case "marginalia":
include "scraper/marginalia.php";
$lib = new marginalia();
break;
case "wiby":
include "scraper/wiby.php";
$lib = new wiby();
break;
case "curlie":
include "scraper/curlie.php";
$lib = new curlie();
break;
case "yep":
include "scraper/yep.php";
$lib = new yep();
break;
case "sc":
include "scraper/sc.php";
$lib = new sc();
break;
case "spotify":
include "scraper/spotify.php";
$lib = new spotify();
break;
case "pinterest":
include "scraper/pinterest.php";
$lib = new pinterest();
break;
case "imgur":
include "scraper/imgur.php";
$lib = new imgur();
break;
case "ftm":
include "scraper/ftm.php";
$lib = new ftm();
break;
}
// Load the scraper file named after $scraper_out and instantiate the class of the same name.
// NOTE(review): both the include path and the class name are built from $scraper_out;
// presumably it has already been restricted to the scraper keys listed above — confirm.
include "scraper/$scraper_out.php";
$lib = new $scraper_out();
// set scraper on $_GET
$_GET["scraper"] = $scraper_out;

View File

@ -24,13 +24,36 @@ try{
}
// bing request, ask bing to resize and stream to browser
$image = parse_url($_GET["i"]);
if(
isset($image["host"]) &&
preg_match(
'/bing.net$/',
parse_url($_GET["i"], PHP_URL_HOST)
// Only bing.net / bing.com and their subdomains. The prefix must end on a
// label boundary so hosts such as "evilbing.com" are rejected, and the
// class is limited to hostname characters ([A-z] also matched [ \ ] ^ _ `).
'/^(?:[a-z0-9-]+\.)*bing\.(?:net|com)$/i',
$image["host"]
)
){
if(
!isset($image["query"]) ||
!isset($image["path"]) ||
$image["path"] != "/th"
){
header("X-Error: Invalid bing image path");
$proxy->do404();
die();
}
parse_str($image["query"], $str);
if(!isset($str["id"])){
header("X-Error: Missing bing ID");
$proxy->do404();
die();
}
switch($_GET["s"]){
case "portrait": $req = "&w=50&h=90&p=0&qlt=90"; break;
@ -40,7 +63,7 @@ try{
case "cover": $req = "&w=207&h=270&p=0&qlt=90"; break;
}
$proxy->stream_linear_image($_GET["i"] . $req, "https://bing.net");
$proxy->stream_linear_image("https://" . $image["host"] . "/th?id=" . urlencode($str["id"]) . $req, "https://www.bing.com");
die();
}

View File

@ -3,78 +3,103 @@
class marginalia{
public function __construct(){
include "lib/fuckhtml.php";
$this->fuckhtml = new fuckhtml();
include "lib/backend.php";
$this->backend = new backend("marginalia");
}
public function getfilters($page){
switch($page){
if(config::MARGINALIA_API_KEY === null){
case "web":
return [
"profile" => [
"display" => "Profile",
"option" => [
"any" => "Default",
"modern" => "Modern"
]
],
"format" => [
"display" => "Format",
"option" => [
"any" => "Any",
"html5" => "html5",
"xhtml" => "xhtml",
"html123" => "html123"
]
],
"file" => [
"display" => "File",
"option" => [
"any" => "Any",
"nomedia" => "Deny media",
"media" => "Contains media",
"audio" => "Contains audio",
"video" => "Contains video",
"archive" => "Contains archive",
"document" => "Contains document"
]
],
"javascript" => [
"display" => "Javascript",
"option" => [
"any" => "Allow JS",
"deny" => "Deny JS",
"require" => "Require JS"
]
],
"trackers" => [
"display" => "Trackers",
"option" => [
"any" => "Allow trackers",
"deny" => "Deny trackers",
"require" => "Require trackers"
]
],
"cookies" => [
"display" => "Cookies",
"option" => [
"any" => "Allow cookies",
"deny" => "Deny cookies",
"require" => "Require cookies"
]
],
"affiliate" => [
"display" => "Affiliate links in body",
"option" => [
"any" => "Allow affiliate links",
"deny" => "Deny affiliate links",
"require" => "Require affiliate links"
]
$base = [
"adtech" => [
"display" => "Reduce adtech",
"option" => [
"no" => "No",
"yes" => "Yes"
]
];
],
"recent" => [
"display" => "Recent results",
"option" => [
"no" => "No",
"yes" => "Yes"
]
],
"intitle" => [
"display" => "Search in title",
"option" => [
"no" => "No",
"yes" => "Yes"
]
]
];
}else{
$base = [];
}
return array_merge(
$base,
[
"format" => [
"display" => "Format",
"option" => [
"any" => "Any format",
"html5" => "html5",
"xhtml" => "xhtml",
"html123" => "html123"
]
],
"file" => [
"display" => "Filetype",
"option" => [
"any" => "Any filetype",
"nomedia" => "Deny media",
"media" => "Contains media",
"audio" => "Contains audio",
"video" => "Contains video",
"archive" => "Contains archive",
"document" => "Contains document"
]
],
"javascript" => [
"display" => "Javascript",
"option" => [
"any" => "Allow JS",
"deny" => "Deny JS",
"require" => "Require JS"
]
],
"trackers" => [
"display" => "Trackers",
"option" => [
"any" => "Allow trackers",
"deny" => "Deny trackers",
"require" => "Require trackers"
]
],
"cookies" => [
"display" => "Cookies",
"option" => [
"any" => "Allow cookies",
"deny" => "Deny cookies",
"require" => "Require cookies"
]
],
"affiliate" => [
"display" => "Affiliate links in body",
"option" => [
"any" => "Allow affiliate links",
"deny" => "Deny affiliate links",
"require" => "Require affiliate links"
]
]
]
);
}
private function get($proxy, $url, $get = []){
@ -132,7 +157,6 @@ class marginalia{
throw new Exception("Search term is empty!");
}
$profile = $get["profile"];
$format = $get["format"];
$file = $get["file"];
@ -180,38 +204,6 @@ class marginalia{
$search = implode(" ", $search);
$params = [
"count" => 20
];
if($profile == "modern"){
$params["index"] = 1;
}
try{
$json =
$this->get(
$this->backend->get_ip(), // no nextpage
"https://api.marginalia.nu/" . config::MARGINALIA_API_KEY . "/search/" . urlencode($search),
$params
);
}catch(Exception $error){
throw new Exception("Failed to get JSON");
}
if($json == "Slow down"){
throw new Exception("The API key used is rate limited. Please try again in a few minutes.");
}
$json = json_decode($json, true);
/*
$handle = fopen("scraper/marginalia.json", "r");
$json = json_decode(fread($handle, filesize("scraper/marginalia.json")), true);
fclose($handle);*/
$out = [
"status" => "ok",
"spelling" => [
@ -228,19 +220,169 @@ class marginalia{
"related" => []
];
foreach($json["results"] as $result){
if(config::MARGINALIA_API_KEY !== null){
try{
$json =
$this->get(
$this->backend->get_ip(), // no nextpage
"https://api.marginalia.nu/" . config::MARGINALIA_API_KEY . "/search/" . urlencode($search),
[
"count" => 20
]
);
}catch(Exception $error){
throw new Exception("Failed to get JSON");
}
if($json == "Slow down"){
throw new Exception("The API key used is rate limited. Please try again in a few minutes.");
}
$json = json_decode($json, true);
foreach($json["results"] as $result){
$out["web"][] = [
"title" => $result["title"],
"description" => str_replace("\n", " ", $result["description"]),
"url" => $result["url"],
"date" => null,
"type" => "web",
"thumb" => [
"url" => null,
"ratio" => null
],
"sublink" => [],
"table" => []
];
}
return $out;
}
// no more cloudflare!! Parse html by default
$params = [
"query" => $search
];
// Translate each enabled yes/no filter into its search.marginalia.nu
// query parameter.
foreach(["adtech", "recent", "intitle"] as $v){
	if($get[$v] == "yes"){
		switch($v){
			case "adtech": $params["adtech"] = "reduce"; break;
			case "recent": $params["recent"] = "recent"; break;
			// this case used to repeat "adtech", so "Search in title"
			// was never sent to the server
			case "intitle": $params["searchTitle"] = "title"; break;
		}
	}
}
try{
$html =
$this->get(
$this->backend->get_ip(),
"https://search.marginalia.nu/search",
$params
);
}catch(Exception $error){
throw new Exception("Failed to get HTML");
}
$this->fuckhtml->load($html);
$sections =
$this->fuckhtml
->getElementsByClassName(
"card search-result",
"section"
);
foreach($sections as $section){
$this->fuckhtml->load($section);
$title =
$this->fuckhtml
->getElementsByClassName(
"title",
"a"
)[0];
$description =
$this->fuckhtml
->getElementsByClassName(
"description",
"p"
);
if(count($description) !== 0){
$description =
$this->fuckhtml
->getTextContent(
$description[0]
);
}else{
$description = null;
}
$sublinks = [];
$sublink_html =
$this->fuckhtml
->getElementsByClassName("additional-results");
if(count($sublink_html) !== 0){
$this->fuckhtml->load($sublink_html[0]);
$links =
$this->fuckhtml
->getElementsByTagName("a");
foreach($links as $link){
$sublinks[] = [
"title" =>
$this->fuckhtml
->getTextContent(
$link
),
"date" => null,
"description" => null,
"url" =>
$this->fuckhtml
->getTextContent(
$link["attributes"]["href"]
)
];
}
}
$out["web"][] = [
"title" => $result["title"],
"description" => str_replace("\n", " ", $result["description"]),
"url" => $result["url"],
"title" =>
$this->fuckhtml
->getTextContent(
$title
),
"description" => $description,
"url" =>
$this->fuckhtml
->getTextContent(
$title["attributes"]["href"]
),
"date" => null,
"type" => "web",
"thumb" => [
"url" => null,
"ratio" => null
],
"sublink" => [],
"sublink" => $sublinks,
"table" => []
];
}

View File

@ -4,11 +4,8 @@ class pinterest{
public function __construct(){
include "lib/nextpage.php";
$this->nextpage = new nextpage("pinterest");
include "lib/proxy_pool.php";
$this->proxy = new proxy_pool("pinterest");
include "lib/backend.php";
$this->backend = new backend("pinterest");
}
public function getfilters($page){

893
scraper/qwant.php Normal file
View File

@ -0,0 +1,893 @@
<?php
class qwant{
/**
 * Load the shared backend helper and bind an instance of it to this
 * scraper under the "qwant" name.
 */
public function __construct(){
	// include_once: if lib/backend.php was already loaded earlier in the
	// request, a plain include would fatally redeclare its class
	include_once "lib/backend.php";
	$this->backend = new backend("qwant");
}
/**
 * Build the filter definitions offered for a Qwant search page.
 *
 * Every page gets the "nsfw" and "country" filters. The "web", "images",
 * "videos" and "news" pages append their own filters after those two;
 * any other $page value returns only the shared ones.
 *
 * @param string $page Page name ("web", "images", "videos" or "news")
 * @return array Map of filter name => ["display" => label, "option" => [value => label]]
 */
public function getfilters($page){
	
	// filters shared by every page
	$base = [
		"nsfw" => [
			"display" => "NSFW",
			"option" => [
				"yes" => "Yes",
				"maybe" => "Maybe",
				"no" => "No"
			]
		],
		"country" => [
			"display" => "Country",
			"option" => [
				"en_US" => "United States",
				"fr_FR" => "France", // was listed twice; duplicate key removed
				"en_GB" => "Great Britain",
				"de_DE" => "Germany",
				"it_IT" => "Italy",
				"es_AR" => "Argentina",
				"en_AU" => "Australia",
				"es_ES" => "Spain (es)",
				"ca_ES" => "Spain (ca)",
				"cs_CZ" => "Czech Republic",
				"ro_RO" => "Romania",
				"el_GR" => "Greece",
				"zh_CN" => "China",
				"zh_HK" => "Hong Kong",
				"en_NZ" => "New Zealand",
				"th_TH" => "Thailand",
				"ko_KR" => "South Korea",
				"sv_SE" => "Sweden",
				"nb_NO" => "Norway",
				"da_DK" => "Denmark",
				"hu_HU" => "Hungary",
				"et_EE" => "Estonia",
				"es_MX" => "Mexico",
				"es_CL" => "Chile",
				"en_CA" => "Canada (en)",
				"fr_CA" => "Canada (fr)",
				"en_MY" => "Malaysia",
				"bg_BG" => "Bulgaria",
				"fi_FI" => "Finland",
				"pl_PL" => "Poland",
				"nl_NL" => "Netherlands",
				"pt_PT" => "Portugal",
				"de_CH" => "Switzerland (de)",
				"fr_CH" => "Switzerland (fr)",
				"it_CH" => "Switzerland (it)",
				"de_AT" => "Austria",
				"fr_BE" => "Belgium (fr)",
				"nl_BE" => "Belgium (nl)",
				"en_IE" => "Ireland",
				"he_IL" => "Israel"
			]
		]
	];
	
	// "time" filter used verbatim by both the web and images pages
	$time = [
		"display" => "Time posted",
		"option" => [
			"any" => "Any time",
			"day" => "Past 24 hours",
			"week" => "Past week",
			"month" => "Past month"
		]
	];
	
	// page-specific filters, appended after the shared ones
	$extra = [];
	
	switch($page){
		
		case "web":
			$extra = [
				"time" => $time,
				"extendedsearch" => [
					// no display, wont show in interface
					"option" => [
						"yes" => "Yes",
						"no" => "No"
					]
				]
			];
			break;
		
		case "images":
			$extra = [
				"time" => $time,
				"size" => [
					"display" => "Size",
					"option" => [
						"any" => "Any size",
						"large" => "Large",
						"medium" => "Medium",
						"small" => "Small"
					]
				],
				"color" => [
					"display" => "Color",
					"option" => [
						"any" => "Any color",
						"coloronly" => "Color only",
						"monochrome" => "Monochrome",
						"black" => "Black",
						"brown" => "Brown",
						"gray" => "Gray",
						"white" => "White",
						"yellow" => "Yellow",
						"orange" => "Orange",
						"red" => "Red",
						"pink" => "Pink",
						"purple" => "Purple",
						"blue" => "Blue",
						"teal" => "Teal",
						"green" => "Green"
					]
				],
				"imagetype" => [
					"display" => "Type",
					"option" => [
						"any" => "Any type",
						"animatedgif" => "Animated GIF",
						"photo" => "Photograph",
						"transparent" => "Transparent"
					]
				],
				"license" => [
					"display" => "License",
					"option" => [
						"any" => "Any license",
						"share" => "Non-commercial reproduction and sharing",
						"sharecommercially" => "Reproduction and sharing",
						"modify" => "Non-commercial reproduction, sharing and modification",
						"modifycommercially" => "Reproduction, sharing and modification",
						"public" => "Public domain"
					]
				]
			];
			break;
		
		case "videos":
			$extra = [
				"order" => [
					"display" => "Order by",
					"option" => [
						"relevance" => "Relevance",
						"views" => "Views",
						"date" => "Most recent"
					]
				],
				"source" => [
					"display" => "Source",
					"option" => [
						"any" => "Any source",
						"youtube" => "YouTube",
						"dailymotion" => "Dailymotion"
					]
				]
			];
			break;
		
		case "news":
			$extra = [
				// news has an extra "hour" option, so it does not reuse $time
				"time" => [
					"display" => "Time posted",
					"option" => [
						"any" => "Any time",
						"hour" => "Less than 1 hour ago",
						"day" => "Past 24 hours",
						"week" => "Past week",
						"month" => "Past month"
					]
				],
				"order" => [
					"display" => "Order by",
					"option" => [
						"relevance" => "Relevance",
						"date" => "Most recent"
					]
				]
			];
			break;
	}
	
	return array_merge($base, $extra);
}
/**
 * Send a GET request with browser-like headers and return the raw
 * response body.
 *
 * @param mixed $proxy Proxy descriptor handed to backend::assign_proxy()
 * @param string $url Request URL without a query string
 * @param array $get Query parameters; appended to $url when not empty
 * @return string|bool Value returned by curl_exec()
 * @throws Exception With the curl error message when the transfer fails
 */
private function get($proxy, $url, $get = []){
	
	$headers = [
		"User-Agent: " . config::USER_AGENT,
		"Accept: application/json, text/plain, */*",
		"Accept-Language: en-US,en;q=0.5",
		"Accept-Encoding: gzip",
		"DNT: 1",
		"Connection: keep-alive",
		"Origin: https://www.qwant.com",
		"Referer: https://www.qwant.com/",
		"Sec-Fetch-Dest: empty",
		"Sec-Fetch-Mode: cors",
		"Sec-Fetch-Site: same-site",
		"TE: trailers"
	];
	
	if($get !== []){
		
		$url .= "?" . http_build_query($get);
	}
	
	$curlproc = curl_init();
	
	curl_setopt($curlproc, CURLOPT_URL, $url);
	curl_setopt($curlproc, CURLOPT_ENCODING, ""); // default encoding
	curl_setopt($curlproc, CURLOPT_HTTPHEADER, $headers);
	
	// Bypass HTTP/2 check
	curl_setopt($curlproc, CURLOPT_HTTP_VERSION, CURL_HTTP_VERSION_2_0);
	
	curl_setopt($curlproc, CURLOPT_RETURNTRANSFER, true);
	curl_setopt($curlproc, CURLOPT_SSL_VERIFYHOST, 2);
	curl_setopt($curlproc, CURLOPT_SSL_VERIFYPEER, true);
	curl_setopt($curlproc, CURLOPT_CONNECTTIMEOUT, 30);
	curl_setopt($curlproc, CURLOPT_TIMEOUT, 30);
	
	$this->backend->assign_proxy($curlproc, $proxy);
	
	$data = curl_exec($curlproc);
	
	if(curl_errno($curlproc)){
		
		// read the message first, then release the handle: the original
		// threw before curl_close() and left the handle open on failure
		$error = curl_error($curlproc);
		curl_close($curlproc);
		
		throw new Exception($error);
	}
	
	curl_close($curlproc);
	return $data;
}
public function web($get){
if($get["npt"]){
// get next page data
[$params, $proxy] = $this->backend->get($get["npt"], "web");
$params = json_decode($params, true);
}else{
// get _GET data instead
$search = $get["s"];
if(strlen($search) === 0){
throw new Exception("Search term is empty!");
}
if(strlen($search) > 2048){
throw new Exception("Search term is too long!");
}
$proxy = $this->backend->get_ip();
$params = [
"q" => $search,
"freshness" => $get["time"],
"count" => 10,
"locale" => $get["country"],
"offset" => 0,
"device" => "desktop",
"tgp" => 3,
"safesearch" => 0,
"displayed" => "true"
];
switch($get["nsfw"]){
case "yes": $params["safesearch"] = 0; break;
case "maybe": $params["safesearch"] = 1; break;
case "no": $params["safesearch"] = 2; break;
}
}
/*
$handle = fopen("scraper/qwant_web.json", "r");
$json = fread($handle, filesize("scraper/qwant_web.json"));
fclose($handle);*/
try{
$json =
$this->get(
$proxy,
"https://fdn.qwant.com/v3/search/web",
$params
);
}catch(Exception $error){
throw new Exception("Could not fetch JSON");
}
$json = json_decode($json, true);
if($json === NULL){
throw new Exception("Failed to decode JSON");
}
if(isset($json["data"]["message"][0])){
throw new Exception("Server returned an error:\n" . $json["data"]["message"][0]);
}
if($json["status"] != "success"){
if($json["data"]["error_code"] === 5){
return $out;
}
throw new Exception("Server returned an error code: " . $json["data"]["error_code"]);
}
if(!isset($json["data"]["result"]["items"]["mainline"])){
throw new Exception("Server did not return a result object");
}
// data is OK, parse
$out = [
"status" => "ok",
"spelling" => [
"type" => "no_correction",
"using" => null,
"correction" => null
],
"npt" => null,
"answer" => [],
"web" => [],
"image" => [],
"video" => [],
"news" => [],
"related" => []
];
// get instant answer
if(
$get["extendedsearch"] == "yes" &&
isset($json["data"]["result"]["items"]["sidebar"][0]["endpoint"])
){
try{
$answer =
$this->get(
$proxy,
"https://api.qwant.com/v3" .
$json["data"]["result"]["items"]["sidebar"][0]["endpoint"],
[]
);
$answer = json_decode($answer, true);
if(
$answer === null ||
$answer["status"] != "success" ||
$answer["data"]["result"] === null
){
throw new Exception();
}
// parse answer
$out["answer"][] = [
"title" => $answer["data"]["result"]["title"],
"description" => [
[
"type" => "text",
"value" => $this->trimdots($answer["data"]["result"]["description"])
]
],
"url" => $answer["data"]["result"]["url"],
"thumb" =>
$answer["data"]["result"]["thumbnail"]["landscape"] == null ?
null :
$this->unshitimage(
$answer["data"]["result"]["thumbnail"]["landscape"],
false
),
"table" => [],
"sublink" => []
];