re-added solofield, added mullvad for brave and google

This commit is contained in:
2025-09-07 14:26:51 -04:00
parent 0c90c4bc9e
commit 73f8472eec
5 changed files with 407 additions and 2 deletions

View File

@@ -43,7 +43,7 @@ class config{
// If this regex expression matches on the user agent, it blocks the request
// Not useful at all against a targeted attack
const HEADER_REGEX = '/bot|wget|curl|python-requests|scrapy|go-http-client|ruby|yahoo|spider|qwant/i';
const HEADER_REGEX = '/bot|wget|curl|python-requests|scrapy|go-http-client|ruby|yahoo|spider|qwant|meta/i';
// Block clients who present any of the following headers in their request (SPECIFY IN !!lowercase!!)
// Eg: ["x-forwarded-for", "x-via", "forwarded-for", "via"];
@@ -118,7 +118,7 @@ class config{
// Default user agent to use for scraper requests. Sometimes ignored to get specific webpages
// Changing this might break things.
const USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:141.0) Gecko/20100101 Firefox/141.0";
const USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:142.0) Gecko/20100101 Firefox/142.0";
// Proxy pool assignments for each scraper
// false = Use server's raw IP
@@ -130,6 +130,8 @@ class config{
const PROXY_GOOGLE = false;
const PROXY_GOOGLE_API = false;
const PROXY_GOOGLE_CSE = false;
const PROXY_MULLVAD_GOOGLE = false;
const PROXY_MULLVAD_BRAVE = false;
const PROXY_STARTPAGE = false;
const PROXY_QWANT = false;
const PROXY_BAIDU = false;
@@ -143,6 +145,7 @@ class config{
const PROXY_WIBY = false;
const PROXY_CURLIE = false;
const PROXY_YT = false; // youtube
const PROXY_ARCHIVEORG = false;
const PROXY_SEPIASEARCH = false;
const PROXY_ODYSEE = false;
const PROXY_VIMEO = false;

342
scraper/mullvad.php Normal file
View File

@@ -0,0 +1,342 @@
<?php
/**
 * Scraper for Mullvad Leta (leta.mullvad.net).
 *
 * One instance queries one upstream engine; the engine name given to the
 * constructor is sent as the "engine" request parameter and is also used to
 * name the backend ("mullvad_<engine>").
 */
class mullvad{
	
	// Declared explicitly: creating properties dynamically is deprecated as of
	// PHP 8.2. They stay public so their visibility matches the dynamic
	// properties they replace.
	public $engine;
	public $backend;
	
	/**
	 * @param string $engine Upstream engine to request from Leta (the wrappers
	 *                       in this project pass "brave" or "google")
	 */
	public function __construct($engine){
		
		$this->engine = $engine;
		
		// include_once: a plain include would redeclare the backend class (a
		// fatal error) if the file was already loaded earlier in the request.
		include_once "lib/backend.php";
		$this->backend = new backend("mullvad_{$this->engine}");
	}
	
	/**
	 * Returns the filters this scraper understands.
	 *
	 * @param string $page Requested page type; unused, every page gets the same filters
	 * @return array Filter definitions keyed by the name of the request parameter
	 */
	public function getfilters($page){
		
		return [
			"country" => [ // &country=
				"display" => "Country",
				"option" => [
					"any" => "Any country",
					"ar" => "Argentina",
					"au" => "Australia",
					"at" => "Austria",
					"be" => "Belgium",
					"br" => "Brazil",
					"ca" => "Canada",
					"cl" => "Chile",
					"cn" => "China",
					"dk" => "Denmark",
					"fi" => "Finland",
					"fr" => "France",
					"de" => "Germany",
					"hk" => "Hong Kong",
					"in" => "India",
					"id" => "Indonesia",
					"it" => "Italy",
					"jp" => "Japan",
					"kr" => "Korea, Republic",
					"my" => "Malaysia",
					"mx" => "Mexico",
					"nl" => "Netherlands",
					"nz" => "New Zealand",
					"no" => "Norway",
					"ph" => "Philippines",
					"pl" => "Poland",
					"pt" => "Portugal",
					"ru" => "Russian Federation",
					"sa" => "Saudi Arabia",
					"za" => "South Africa",
					"es" => "Spain",
					"se" => "Sweden",
					"ch" => "Switzerland",
					"tw" => "Taiwan",
					"tr" => "Turkey",
					"uk" => "United Kingdom",
					"us" => "United States"
				]
			],
			"language" => [ // &language=
				"display" => "Language",
				"option" => [
					"any" => "Any language",
					"ar" => "Arabic",
					"bg" => "Bulgarian",
					"ca" => "Catalan",
					"zh-hans" => "Chinese (Simplified)",
					"zh-hant" => "Chinese (Traditional)",
					"hr" => "Croatian",
					"cs" => "Czech",
					"da" => "Danish",
					"nl" => "Dutch",
					"en" => "English",
					"et" => "Estonian",
					"fi" => "Finnish",
					"fr" => "French",
					"de" => "German",
					"he" => "Hebrew",
					"hu" => "Hungarian",
					"is" => "Icelandic",
					"it" => "Italian",
					"jp" => "Japanese",
					"ko" => "Korean",
					"lv" => "Latvian",
					"lt" => "Lithuanian",
					"nb" => "Norwegian",
					"pl" => "Polish",
					"pt" => "Portuguese",
					"ro" => "Romanian",
					"ru" => "Russian",
					"sr" => "Serbian",
					"sk" => "Slovak",
					"sl" => "Slovenian",
					"es" => "Spanish",
					"sv" => "Swedish",
					"tr" => "Turkish"
				]
			],
			"time" => [ // &lastUpdated=
				"display" => "Time posted",
				"option" => [
					"any" => "Any time",
					"d" => "Past day",
					"w" => "Past week",
					"m" => "Past month",
					"y" => "Past year"
				]
			]
		];
	}
	
	/**
	 * Performs a GET request and returns the response body.
	 *
	 * @param mixed  $proxy Proxy descriptor, handed as-is to backend::assign_proxy()
	 * @param string $url   URL to fetch, without a query string
	 * @param array  $get   Query parameters appended to $url (optional)
	 * @return string Response body
	 * @throws Exception With the cURL error message if the transfer fails
	 */
	private function get($proxy, $url, $get = []){
		
		if($get !== []){
			
			$url .= "?" . http_build_query($get);
		}
		
		$curlproc = curl_init();
		
		curl_setopt($curlproc, CURLOPT_URL, $url);
		
		// http2 bypass
		curl_setopt($curlproc, CURLOPT_HTTP_VERSION, CURL_HTTP_VERSION_2_0);
		
		curl_setopt($curlproc, CURLOPT_ENCODING, ""); // default encoding
		
		// NOTE(review): the explicit Accept-Encoding header below replaces the
		// one cURL would generate for CURLOPT_ENCODING "". If the local libcurl
		// was built without brotli/zstd support the response may come back in
		// an encoding it cannot decode -- confirm on the deployment target.
		curl_setopt($curlproc, CURLOPT_HTTPHEADER,
			["User-Agent: " . config::USER_AGENT,
			"Accept: */*",
			"Accept-Language: en-US,en;q=0.5",
			"Accept-Encoding: gzip, deflate, br, zstd",
			"Referer: https://leta.mullvad.net/search",
			"DNT: 1",
			"Sec-GPC: 1",
			"Connection: keep-alive",
			// Advertise the engine this instance actually queries, so the
			// cookie agrees with the "engine" request parameter.
			"Cookie: engine={$this->engine}",
			"Sec-Fetch-Dest: empty",
			"Sec-Fetch-Mode: cors",
			"Sec-Fetch-Site: same-origin",
			"Priority: u=0",
			"TE: trailers"]
		);
		
		curl_setopt($curlproc, CURLOPT_RETURNTRANSFER, true);
		curl_setopt($curlproc, CURLOPT_SSL_VERIFYHOST, 2);
		curl_setopt($curlproc, CURLOPT_SSL_VERIFYPEER, true);
		curl_setopt($curlproc, CURLOPT_CONNECTTIMEOUT, 30);
		curl_setopt($curlproc, CURLOPT_TIMEOUT, 30);
		
		$this->backend->assign_proxy($curlproc, $proxy);
		
		$data = curl_exec($curlproc);
		
		// Read the error state BEFORE closing the handle, and close the handle
		// on the failure path too so it is not leaked when we throw.
		$failed = $data === false || curl_errno($curlproc) !== 0;
		$error = $failed ? curl_error($curlproc) : null;
		
		curl_close($curlproc);
		
		if($failed){
			
			throw new Exception($error);
		}
		
		return $data;
	}
	
	/**
	 * Runs a web search, or fetches the next page of a previous search.
	 *
	 * @param array $get Request parameters: "s" (search term), "country",
	 *                   "language", "time" (see getfilters()) and "npt"
	 *                   (next page token returned by an earlier call)
	 * @return array Result set; "web" holds the results and "npt" holds the
	 *               token for the following page, or null if there is none
	 * @throws Exception On an empty search term, transport failure or an
	 *                   unexpected response
	 */
	public function web($get){
		
		if(!empty($get["npt"])){
			
			// Continue a previous search: the token maps to the stored request
			// parameters and the proxy that search was made through.
			[$params, $proxy] = $this->backend->get($get["npt"], "web");
			
			$params = json_decode($params, true);
			
			if(!is_array($params)){
				
				throw new Exception("Failed to decode next page token");
			}
		}else{
			
			$params = $this->build_params($get);
			$proxy = $this->backend->get_ip();
		}
		
		try{
			
			$json =
				$this->get(
					$proxy,
					"https://leta.mullvad.net/search/__data.json",
					$params
				);
			
		}catch(Exception $error){
			
			// Same message as before; the cause is chained for debugging
			throw new Exception("Failed to fetch search page", 0, $error);
		}
		
		$json = json_decode($json, true);
		
		if($json === null){
			
			throw new Exception("Failed to decode JSON");
		}
		
		if(
			!is_array($json) ||
			!isset($json["nodes"]) ||
			!is_array($json["nodes"])
		){
			
			throw new Exception("Mullvad did not return a nodes object");
		}
		
		return $this->parse_nodes($json["nodes"], $params, $proxy);
	}
	
	/**
	 * Builds the upstream request parameters for the first page of a search.
	 *
	 * @param array $get Request parameters as received by web()
	 * @return array Query parameters for the search endpoint
	 * @throws Exception If the search term is missing or empty
	 */
	private function build_params($get){
		
		$search = $get["s"] ?? "";
		
		if(strlen($search) === 0){
			
			throw new Exception("Search term is empty!");
		}
		
		$params = [
			"q" => $search,
			"engine" => $this->engine,
			"page" => 1
		];
		
		// our filter name => upstream parameter name
		$filters = [
			"country" => "country",
			"language" => "language",
			"time" => "lastUpdated"
		];
		
		foreach($filters as $filter => $param){
			
			// A filter that was not supplied behaves like "any"
			$value = $get[$filter] ?? "any";
			
			if($value !== "any"){
				
				$params[$param] = $value;
			}
		}
		
		return $params;
	}
	
	/**
	 * Converts the "nodes" array of the response into a result set.
	 *
	 * Values inside a node are not nested: $node["data"][0] maps field names
	 * to indexes into $node["data"], and the values at those indexes may in
	 * turn contain further indexes.
	 *
	 * @param array $nodes  Decoded "nodes" array
	 * @param array $params Parameters of the request that produced $nodes;
	 *                      reused (with "page" replaced) for the next page
	 * @param mixed $proxy  Proxy the request was made through
	 * @return array Result set as documented on web()
	 * @throws Exception If the search node is missing expected fields
	 */
	private function parse_nodes($nodes, $params, $proxy){
		
		$out = [
			"status" => "ok",
			"spelling" => [
				"type" => "no_correction",
				"using" => null,
				"correction" => null
			],
			"npt" => null, // filled in below when a next page exists
			"answer" => [],
			"web" => [],
			"image" => [],
			"video" => [],
			"news" => [],
			"related" => []
		];
		
		foreach($nodes as $node){
			
			if(!isset($node["data"][0]["q"])){
				
				// not iterating through the query object
				continue;
			}
			
			$data = $node["data"];
			
			// entry 0 contains pointers to what we need to iterate through
			$root = $data[0];
			
			$success = $this->deref($data, $root["success"] ?? null);
			
			if($success === null){
				
				throw new Exception("Mullvad did not return a success object");
			}
			
			if($success === false){
				
				throw new Exception("Mullvad flagged the response as unsuccessful");
			}
			
			$items = $this->deref($data, $root["items"] ?? null);
			
			if(!is_array($items)){
				
				throw new Exception("Mullvad did not return an items object");
			}
			
			//
			// Iterate over results
			//
			foreach($items as $item_pointer){
				
				$item = $this->deref($data, $item_pointer);
				
				if(!is_array($item)){
					
					continue;
				}
				
				$link = $this->deref($data, $item["link"] ?? null);
				
				if(!is_string($link)){
					
					// a result without an URL is of no use to the caller
					continue;
				}
				
				$title = $this->deref($data, $item["title"] ?? null);
				$description = $this->deref($data, $item["snippet"] ?? null);
				
				// Missing or non-text fields become empty strings
				$title = is_string($title) ? $title : "";
				$description = is_string($description) ? $description : "";
				
				$date = null;
				
				if($this->engine == "google"){
					
					[$date, $description] = $this->split_date($description);
				}
				
				$out["web"][] = [
					"title" => $this->titledots($title),
					"description" => $this->titledots($description),
					"url" => $link,
					"date" => $date,
					"type" => "web",
					"thumb" => [
						"url" => null,
						"ratio" => null
					],
					"sublink" => [],
					"table" => []
				];
			}
			
			//
			// Get nextpage
			//
			$next = $this->deref($data, $root["next"] ?? null);
			
			if($next !== null){
				
				$params["page"] = (int)$next;
				
				$out["npt"] =
					$this->backend->store(
						json_encode($params),
						"web",
						$proxy
					);
			}
		}
		
		return $out;
	}
	
	/**
	 * Looks up the value a pointer refers to.
	 *
	 * @param array $data    The node's "data" array
	 * @param mixed $pointer Index into $data
	 * @return mixed The referenced value, or null if the pointer is not a
	 *               usable index or nothing is stored there
	 */
	private function deref($data, $pointer){
		
		if(
			!is_int($pointer) &&
			!is_string($pointer)
		){
			
			return null;
		}
		
		return $data[$pointer] ?? null;
	}
	
	/**
	 * Splits a leading date off a snippet shaped like "Jan 12, 2017 ... text".
	 *
	 * The prefix only counts as a date if it is shorter than 15 bytes and
	 * strtotime() can parse it; otherwise the snippet is returned untouched.
	 *
	 * @param string $description Snippet text
	 * @return array [unix timestamp or null, snippet without the date prefix]
	 */
	private function split_date($description){
		
		$parts = explode(" ... ", $description, 2);
		
		if(
			count($parts) !== 2 ||
			strlen($parts[0]) >= 15
		){
			
			return [null, $description];
		}
		
		$date = strtotime(trim($parts[0]));
		
		if($date === false){
			
			return [null, $description];
		}
		
		return [$date, trim($parts[1])];
	}
	
	/**
	 * Strips whitespace and dots from both ends of a string.
	 *
	 * @param string $title Text to clean; anything else is cast to string first
	 * @return string
	 */
	private function titledots($title){
		
		return trim((string)$title, " .\t\n\r\0\x0B");
	}
}

20
scraper/mullvad_brave.php Normal file
View File

@@ -0,0 +1,20 @@
<?php
/**
 * Mullvad Leta scraper bound to the "brave" engine.
 *
 * Thin wrapper: every call is forwarded to a shared mullvad instance.
 */
class mullvad_brave{
	
	// Declared explicitly: creating properties dynamically is deprecated as of
	// PHP 8.2. Public so its visibility matches the dynamic property it replaces.
	public $mullvad;
	
	public function __construct(){
		
		// include_once: a plain include would redeclare class mullvad (a fatal
		// error) if another Mullvad wrapper was already loaded in this process.
		include_once "scraper/mullvad.php";
		$this->mullvad = new mullvad("brave");
	}
	
	/**
	 * @param string $page Requested page type, forwarded unchanged
	 * @return array Filter definitions from mullvad::getfilters()
	 */
	public function getfilters($page){
		
		return $this->mullvad->getfilters($page);
	}
	
	/**
	 * @param array $get Request parameters, forwarded unchanged
	 * @return array Result set from mullvad::web()
	 */
	public function web($get){
		
		return $this->mullvad->web($get);
	}
}

View File

@@ -0,0 +1,20 @@
<?php
/**
 * Mullvad Leta scraper bound to the "google" engine.
 *
 * Thin wrapper: every call is forwarded to a shared mullvad instance.
 */
class mullvad_google{
	
	// Declared explicitly: creating properties dynamically is deprecated as of
	// PHP 8.2. Public so its visibility matches the dynamic property it replaces.
	public $mullvad;
	
	public function __construct(){
		
		// include_once: a plain include would redeclare class mullvad (a fatal
		// error) if another Mullvad wrapper was already loaded in this process.
		include_once "scraper/mullvad.php";
		$this->mullvad = new mullvad("google");
	}
	
	/**
	 * @param string $page Requested page type, forwarded unchanged
	 * @return array Filter definitions from mullvad::getfilters()
	 */
	public function getfilters($page){
		
		return $this->mullvad->getfilters($page);
	}
	
	/**
	 * @param array $get Request parameters, forwarded unchanged
	 * @return array Result set from mullvad::web()
	 */
	public function web($get){
		
		return $this->mullvad->web($get);
	}
}

View File

@@ -125,6 +125,10 @@ $settings = [
"value" => "brave",
"text" => "Brave"
],
[
"value" => "mullvad_brave",
"text" => "Mullvad (Brave)"
],
[
"value" => "yandex",
"text" => "Yandex"
@@ -137,6 +141,10 @@ $settings = [
"value" => "google_cse",
"text" => "Google CSE"
],
[
"value" => "mullvad_google",
"text" => "Mullvad (Google)"
],
[
"value" => "startpage",
"text" => "Startpage"
@@ -177,6 +185,10 @@ $settings = [
"value" => "coccoc",
"text" => "Cốc Cốc"
],
[
"value" => "solofield",
"text" => "Solofield"
],
[
"value" => "marginalia",
"text" => "Marginalia"
@@ -231,6 +243,10 @@ $settings = [
"value" => "baidu",
"text" => "Baidu"
],
[
"value" => "solofield",
"text" => "Solofield"
],
[
"value" => "pinterest",
"text" => "Pinterest"
@@ -308,6 +324,10 @@ $settings = [
[
"value" => "coccoc",
"text" => "Cốc Cốc"
],
[
"value" => "solofield",
"text" => "Solofield"
]
]
],