Compare commits

..

18 Commits

Author SHA1 Message Date
95819bfe52 yep api fix 2026-05-20 11:05:41 -04:00
e1e92d715e add support for yep api 2026-05-20 11:01:18 -04:00
394f401921 Merge pull request 'add lwthiker/curl-impersonate' (#94) from docker-curl-impersonate into master
Reviewed-on: #94
2026-05-20 05:02:30 +00:00
25e8095d0d yep scraper fix? still getting tons of CF errors... 2026-05-20 00:59:27 -04:00
cf3c77ed04 add lwthiker/curl-impersonate 2026-05-19 17:08:10 -07:00
c45f8b1e12 fix startpage pagination and fuckups with word corrections 2026-05-09 22:24:36 -04:00
6086c63148 extract_json fix 2026-05-05 03:32:45 -04:00
d2b0a414ad yandex videos fix 2026-05-02 17:44:53 -04:00
c713d52b5f google scraper fix.. haha just kidding i modified the readme file 2026-05-01 20:10:51 -04:00
0861450b8a detect pinterest captcha 2026-04-29 01:29:54 -04:00
88012f6ae2 i hate git 2026-04-29 01:20:04 -04:00
0dabcea0aa link fix 2026-04-29 01:16:50 -04:00
a8022d22a7 remove settings 2026-04-26 16:24:29 -04:00
9ea0372bb7 remove extremely low quality scrapers 2026-04-26 16:22:24 -04:00
a54f212550 yep scraper fix, removed image and news 2026-04-26 05:08:15 -04:00
b1f5974e40 new google message 2026-04-25 23:03:42 -04:00
e63a17d6db added google api image scraper 2026-04-25 22:58:28 -04:00
4349bf232d captcha bruteforce fix 2026-04-11 17:32:20 -04:00
18 changed files with 890 additions and 2410 deletions

View File

@@ -1,8 +1,17 @@
FROM lwthiker/curl-impersonate:0.6.1-ff-alpine AS curl-impersonate
FROM alpine:3.21 FROM alpine:3.21
WORKDIR /var/www/html/4get WORKDIR /var/www/html/4get
RUN apk update && apk upgrade RUN apk update && apk upgrade
RUN apk add php apache2-ssl php84-fileinfo php84-openssl php84-iconv php84-common php84-dom php84-sodium php84-curl curl php84-pecl-apcu php84-apache2 imagemagick php84-pecl-imagick php84-mbstring imagemagick-webp imagemagick-jpeg RUN apk add php apache2-ssl php84-fileinfo php84-openssl php84-iconv php84-common php84-dom php84-sodium php84-curl curl php84-pecl-apcu php84-apache2 imagemagick php84-pecl-imagick php84-mbstring imagemagick-webp imagemagick-jpeg nss ca-certificates
COPY --from=curl-impersonate /usr/local/bin /usr/local/bin
COPY --from=curl-impersonate /usr/local/lib /usr/local/lib
ENV LD_PRELOAD=/usr/local/lib/libcurl-impersonate-ff.so
ENV CURL_IMPERSONATE=ff117
ENV CURL_IMPERSONATE_HEADERS=no
COPY . . COPY . .
@@ -14,4 +23,4 @@ EXPOSE 443
ENV FOURGET_PROTO=http ENV FOURGET_PROTO=http
ENTRYPOINT ["./docker/docker-entrypoint.sh"] ENTRYPOINT ["./docker/docker-entrypoint.sh"]
CMD ["start"] CMD ["start"]

View File

@@ -13,11 +13,11 @@ _NOT to be confused with 4get.ch, 4get.lol and friends! I **don't** host these._
## Totally unbiased comparison between alternatives ## Totally unbiased comparison between alternatives
| | 4get | searx(ng) | libreY | araa | hearch.co | | | 4get | searx(ng) | whoogle | degoog |
|----------------------------|-------------------------|-----------|-------------|-----------|-------------------| |----------------------------|-------------------------|-----------|------------|--------------------------------------|
| RAM usage | 200-400mb~ | 2GB~ | 200-400mb~ | 2GB~ | idk | | RAM usage | 100-400mb~ | 400mb-1GB | 100mb | 200mb-1GB |
| Does it suck | no (debunked by snopes) | yes | yes | a little | better than searx | | Does it suck | no (debunked by snopes) | yes | kind of? | its kinda cool but no search filters |
| Does it work | ye | sometimes | sometimes | sometimes | yes | | Does it work | ye | lmao | shits dead | works right now... |
## Features ## Features
1. Rotating proxies on a per-scraper basis 1. Rotating proxies on a per-scraper basis
@@ -31,25 +31,29 @@ tl;dr 4get is the best way to browse for shit.
# Supported websites # Supported websites
| Web | Images | Videos | News | Music | Autocompleter | | web | images | videos | news | music | autocomplete |
|------------|--------------|--------------|------------|------------|---------------| |--------------|--------------|--------------|--------------|------------|--------------|
| DuckDuckGo | DuckDuckGo | YouTube | DuckDuckGo | Soundcloud | Brave | | DuckDuckGo | DuckDuckGo | YouTube | DuckDuckGo | SoundCloud | Brave |
| Brave | Brave | Sepia Search | Brave | | DuckDuckGo | | Brave | Yandex | Vimeo | Brave | Swisscows | DuckDuckGo |
| Yandex | Yandex | DuckDuckGo | Google | | Yandex | | Yandex | Brave | Sepia Search | Google | | Yandex |
| Google | Google | Brave | Startpage | | Google | | Google | Google | DuckDuckGo | Yahoo! JAPAN | | Google |
| Startpage | Startpage | Yandex | Qwant | | Startpage | | Google API | Google API | Brave | Startpage | | Startpage |
| Qwant | Qwant | Google | Mojeek | | Kagi | | Google CSE | Google CSE | Yandex | Qwant | | Kagi |
| Ghostery | Yep | Startpage | Baidu | | Qwant | | Yahoo! JAPAN | Yahoo! JAPAN | Google | Mojeek | | Qwant |
| Yep | Baidu | Qwant | | | Ghostery | | Startpage | Startpage | Yahoo! JAPAN | Baidu | | Ghostery |
| Greppr | Pinterest | Baidu | | | Yep | | Qwant | Qwant | Startpage | | | Yep |
| Crowdview | 500px | Coc Coc | | | Marginalia | | Ghostery | Baidu | Qwant | | | Marginalia |
| Mwmbl | VSCO | | | | YouTube | | Yep | Solofield | Baidu | | | YouTube |
| Mojeek | Imgur | | | | Soundcloud | | Mwmbl | Pinterest | Coc Coc | | | SoundCloud |
| Baidu | FindThatMeme | | | | | | Mojeek | Cara | Solofield | | | |
| Coc Coc | | | | | | | Baidu | Flickr | | | | |
| Marginalia | | | | | | | Coc Coc | Pexels | | | | |
| wiby | | | | | | | Solofield | Pixabay | | | | |
| Curlie | | | | | | | Marginalia | Unsplash | | | | |
| wiby | 500px | | | | |
| | VSCO | | | | |
| | Imgur | | | | |
| | FindThatMeme | | | | |
# Installation # Installation
Refer to the <a href="https://git.lolcat.ca/lolcat/4get/src/branch/master/docs/">documentation index</a>. I recommend following the <a href="https://git.lolcat.ca/lolcat/4get/src/branch/master/docs/apache2.md">apache2 guide</a>. Refer to the <a href="https://git.lolcat.ca/lolcat/4get/src/branch/master/docs/">documentation index</a>. I recommend following the <a href="https://git.lolcat.ca/lolcat/4get/src/branch/master/docs/apache2.md">apache2 guide</a>.

1
data/api_keys/yep.txt Normal file
View File

@@ -0,0 +1 @@
# Paste Yep API keys here

View File

@@ -23,6 +23,16 @@ class config{
// Enable the API? // Enable the API?
const API_ENABLED = true; const API_ENABLED = true;
//
// 4play (session provider)
//
// Enable 4play API?
const FPLAY_ENABLE_API = true;
// 4play password. Please set this to something secure if you enable the 4play API.
// This password is used to POST sessions to /api/v2/provide_sesh
const FPLAY_PASSWORD = "1234";
// //
// BOT PROTECTION // BOT PROTECTION
// //
@@ -118,10 +128,10 @@ class config{
// Default user agent to use for scraper requests. Sometimes ignored to get specific webpages // Default user agent to use for scraper requests. Sometimes ignored to get specific webpages
// Changing this might break things. // Changing this might break things.
const USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:149.0) Gecko/20100101 Firefox/149.0"; const USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:151.0) Gecko/20100101 Firefox/151.0";
// User agent to use with 4get-friendly APIs // User agent to use with 4get-friendly APIs
const USER_AGENT_FRIENDLY = "4get-scrapist"; const USER_AGENT_FRIENDLY = "4get-scrapist (+https://4get.ca)";
// Proxy pool assignments for each scraper // Proxy pool assignments for each scraper
// false = Use server's raw IP // false = Use server's raw IP
@@ -131,7 +141,6 @@ class config{
const PROXY_YAHOO = false; const PROXY_YAHOO = false;
const PROXY_YAHOO_JAPAN = false; const PROXY_YAHOO_JAPAN = false;
const PROXY_BRAVE = false; const PROXY_BRAVE = false;
const PROXY_FB = false; // facebook
const PROXY_GOOGLE = false; const PROXY_GOOGLE = false;
const PROXY_GOOGLE_API = false; const PROXY_GOOGLE_API = false;
const PROXY_GOOGLE_CSE = false; const PROXY_GOOGLE_CSE = false;
@@ -155,7 +164,6 @@ class config{
const PROXY_VIMEO = false; const PROXY_VIMEO = false;
const PROXY_YEP = false; const PROXY_YEP = false;
const PROXY_PINTEREST = false; const PROXY_PINTEREST = false;
const PROXY_SANKAKUCOMPLEX = false;
const PROXY_FLICKR = false; const PROXY_FLICKR = false;
const PROXY_PIXABAY = false; const PROXY_PIXABAY = false;
const PROXY_UNSPLASH = false; const PROXY_UNSPLASH = false;
@@ -164,8 +172,6 @@ class config{
const PROXY_VSCO = false; const PROXY_VSCO = false;
const PROXY_SEZNAM = false; const PROXY_SEZNAM = false;
const PROXY_NAVER = false; const PROXY_NAVER = false;
const PROXY_GREPPR = false;
const PROXY_CROWDVIEW = false;
const PROXY_MWMBL = false; const PROXY_MWMBL = false;
const PROXY_FTM = false; // findthatmeme const PROXY_FTM = false; // findthatmeme
const PROXY_IMGUR = false; const PROXY_IMGUR = false;
@@ -173,6 +179,11 @@ class config{
const PROXY_YANDEX_W = false; // yandex web const PROXY_YANDEX_W = false; // yandex web
const PROXY_YANDEX_I = false; // yandex images const PROXY_YANDEX_I = false; // yandex images
const PROXY_YANDEX_V = false; // yandex videos const PROXY_YANDEX_V = false; // yandex videos
const PROXY_SAFEBOORU = false;
const PROXY_KONACHAN = false;
const PROXY_YANDERE = false;
const PROXY_TBIB = false;
const PROXY_GELBOORU = false;
// //
// Scraper-specific parameters // Scraper-specific parameters
@@ -185,4 +196,7 @@ class config{
// Use "null" to default out to HTML scraping OR specify a string to // Use "null" to default out to HTML scraping OR specify a string to
// use the API (Eg: "public"). API has less filters. // use the API (Eg: "public"). API has less filters.
const MARGINALIA_API_KEY = null; const MARGINALIA_API_KEY = null;
// Yep
const YEP_USE_API = false;
} }

View File

@@ -133,6 +133,9 @@ class bot_protection{
$answers[] = $regex; $answers[] = $regex;
} }
// dedup
$answers = array_unique($answers);
if( if(
!$invalid && !$invalid &&
$key !== false // has captcha been gen'd? $key !== false // has captcha been gen'd?

View File

@@ -2,6 +2,52 @@
class frontend{ class frontend{
/**
 * Check that a URL is an absolute http(s) URL and, optionally, that it
 * does not point at a private or reserved network address.
 *
 * @param mixed $url          URL to validate. Non-strings are rejected.
 * @param bool  $net_validate When true, the host is resolved (if it is
 *                            not already an IP literal) and every
 *                            resulting address must be outside the
 *                            private and reserved ranges.
 * @return bool true if the URL passed every check, false otherwise.
 *
 * NOTE(review): the address is resolved here and presumably resolved
 * again by whatever fetches the URL afterwards, so this check alone
 * does not stop DNS rebinding -- confirm how callers use the result.
 */
public function validateurl($url, $net_validate = false){
	
	// parse_url() throws a TypeError on non-strings under PHP 8
	if(!is_string($url)){
		
		return false;
	}
	
	$url_parts = parse_url($url);
	
	// check if required parts are there
	// (parse_url returns false on garbage input, isset() copes with that)
	if(
		!isset($url_parts["scheme"]) ||
		!(
			$url_parts["scheme"] == "http" ||
			$url_parts["scheme"] == "https"
		) ||
		!isset($url_parts["host"])
	){
		
		return false;
	}
	
	if(!$net_validate){
		
		return true;
	}
	
	$host =
		str_replace(
			["[", "]"], // handle ipv6
			"",
			$url_parts["host"]
		);
	
	if(filter_var($host, FILTER_VALIDATE_IP)){
		
		// host is already an IP literal
		$ips = [$host];
	}else{
		
		// resolve ALL of the domain's addresses, not just the first one:
		// a host with several A records could otherwise hide a private
		// address behind a public one.
		// The trailing dot makes the name absolute; strip existing dots
		// first so "example.com." does not turn into "example.com..".
		$ips = gethostbynamel(rtrim($url_parts["host"], ".") . ".");
		
		if(
			$ips === false ||
			count($ips) === 0
		){
			
			// does not resolve
			return false;
		}
	}
	
	// reject localhost, private and reserved ranges
	foreach($ips as $ip){
		
		if(
			filter_var(
				$ip,
				FILTER_VALIDATE_IP, FILTER_FLAG_NO_PRIV_RANGE | FILTER_FLAG_NO_RES_RANGE
			) === false
		){
			
			return false;
		}
	}
	
	return true;
}
public function load($template, $replacements = []){ public function load($template, $replacements = []){
$replacements["server_name"] = htmlspecialchars(config::SERVER_NAME); $replacements["server_name"] = htmlspecialchars(config::SERVER_NAME);
@@ -600,16 +646,13 @@ class frontend{
"qwant" => "Qwant", "qwant" => "Qwant",
"ghostery" => "Ghostery", "ghostery" => "Ghostery",
"yep" => "Yep", "yep" => "Yep",
"greppr" => "Greppr",
"crowdview" => "Crowdview",
"mwmbl" => "Mwmbl", "mwmbl" => "Mwmbl",
"mojeek" => "Mojeek", "mojeek" => "Mojeek",
"baidu" => "Baidu", "baidu" => "Baidu",
"coccoc" => "Cốc Cốc", "coccoc" => "Cốc Cốc",
"solofield" => "Solofield", "solofield" => "Solofield",
"marginalia" => "Marginalia", "marginalia" => "Marginalia",
"wiby" => "wiby", "wiby" => "wiby"
"curlie" => "Curlie"
] ]
]; ];
break; break;
@@ -622,11 +665,11 @@ class frontend{
"yandex" => "Yandex", "yandex" => "Yandex",
"brave" => "Brave", "brave" => "Brave",
"google" => "Google", "google" => "Google",
"google_api" => "Google API",
"google_cse" => "Google CSE", "google_cse" => "Google CSE",
"yahoo_japan" => "Yahoo! JAPAN", "yahoo_japan" => "Yahoo! JAPAN",
"startpage" => "Startpage", "startpage" => "Startpage",
"qwant" => "Qwant", "qwant" => "Qwant",
"yep" => "Yep",
"baidu" => "Baidu", "baidu" => "Baidu",
"solofield" => "Solofield", "solofield" => "Solofield",
"pinterest" => "Pinterest", "pinterest" => "Pinterest",
@@ -638,8 +681,7 @@ class frontend{
"fivehpx" => "500px", "fivehpx" => "500px",
"vsco" => "VSCO", "vsco" => "VSCO",
"imgur" => "Imgur", "imgur" => "Imgur",
"ftm" => "FindThatMeme", "ftm" => "FindThatMeme"
//"sankakucomplex" => "SankakuComplex"
] ]
]; ];
break; break;
@@ -678,7 +720,6 @@ class frontend{
"yahoo_japan" => "Yahoo! JAPAN", "yahoo_japan" => "Yahoo! JAPAN",
"startpage" => "Startpage", "startpage" => "Startpage",
"qwant" => "Qwant", "qwant" => "Qwant",
"yep" => "Yep",
"mojeek" => "Mojeek", "mojeek" => "Mojeek",
"baidu" => "Baidu" "baidu" => "Baidu"
] ]
@@ -695,6 +736,22 @@ class frontend{
] ]
]; ];
break; break;
case "booru":
$filters["scraper"] = [
"display" => "Scraper",
"option" => [
"safebooru" => "Safebooru",
"konachan" => "Konachan",
"tbib" => "The Big Imageboard",
"gelbooru" => "Gelbooru",
"yandere" => "Yande.re",
"tbib" => "The Big Imageboard",
"sankakucomplex" => "SankakuComplex",
"soybooru" => "SoyBooru"
]
];
break;
} }
// get scraper name from user input, or default out to preferred scraper // get scraper name from user input, or default out to preferred scraper
@@ -871,6 +928,7 @@ class frontend{
$html = null; $html = null;
//foreach(["web", "images", "videos", "news", "music", "booru"] as $type){
foreach(["web", "images", "videos", "news", "music"] as $type){ foreach(["web", "images", "videos", "news", "music"] as $type){
$html .= '<a href="/' . $type . '?s=' . urlencode($query); $html .= '<a href="/' . $type . '?s=' . urlencode($query);

View File

@@ -553,28 +553,21 @@ class fuckhtml{
case "\"": case "\"":
case "'": case "'":
if( // count preceding backslashes
$i !== 0 && // only check if a quote could be there $bsCount = 0;
( $j = $i - 1;
(
$json[$i - 1] === "\\" && while($j >= 0 && $json[$j] === "\\"){
( $bsCount++;
$i === 2 || $j--;
$json[$i - 2] === "\\" }
)
) || // quote is NOT escaped if even number of backslashes
$json[$i - 1] !== "\\" if($bsCount % 2 === 0){
)
){
// found a non-escaped quote
if($in_quote === null){ if($in_quote === null){
// open quote // open quote
$in_quote = $json[$i]; $in_quote = $json[$i];
}elseif($in_quote === $json[$i]){ }elseif($in_quote === $json[$i]){
// close quote // close quote
$in_quote = null; $in_quote = null;
} }

View File

@@ -347,11 +347,8 @@ class brave{
$q["spellcheck"] = "0"; $q["spellcheck"] = "0";
} }
} }
/*
$handle = fopen("scraper/brave.html", "r");
$html = fread($handle, filesize("scraper/brave.html"));
fclose($handle);*/
//$html = file_get_contents("scraper/brave.html");
try{ try{
$html = $html =
$this->get( $this->get(

View File

@@ -1,145 +0,0 @@
<?php
/**
 * Crowdview web scraper.
 *
 * Queries the Crowdview JSON search endpoint and maps the response to
 * the result structure returned by web().
 */
class crowdview{
	
	// declared explicitly: creating properties on the fly is deprecated
	// since PHP 8.2. Kept public so outside access keeps working.
	public $backend;
	public $fuckhtml;
	
	public function __construct(){
		
		include "lib/backend.php";
		$this->backend = new backend("crowdview");
		
		include "lib/fuckhtml.php";
		$this->fuckhtml = new fuckhtml();
	}
	
	/**
	 * This scraper exposes no search filters.
	 *
	 * @param string $page Search page name (unused).
	 * @return array Always an empty array.
	 */
	public function getfilters($page){
		
		return [];
	}
	
	/**
	 * Perform a GET request with browser-like headers.
	 *
	 * @param mixed  $proxy Proxy handed to backend->assign_proxy().
	 * @param string $url   Target URL without query string.
	 * @param array  $get   Query parameters, appended when non-empty.
	 * @return string Response body.
	 * @throws Exception With the cURL error message if the transfer fails.
	 */
	private function get($proxy, $url, $get = []){
		
		$curlproc = curl_init();
		
		if($get !== []){
			
			$get = http_build_query($get);
			$url .= "?" . $get;
		}
		
		curl_setopt($curlproc, CURLOPT_URL, $url);
		
		curl_setopt($curlproc, CURLOPT_ENCODING, ""); // default encoding
		curl_setopt($curlproc, CURLOPT_HTTPHEADER,
			["User-Agent: " . config::USER_AGENT,
			"Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
			"Accept-Language: en-US,en;q=0.5",
			"Accept-Encoding: gzip",
			"DNT: 1",
			"Connection: keep-alive",
			"Upgrade-Insecure-Requests: 1",
			"Sec-Fetch-Dest: document",
			"Sec-Fetch-Mode: navigate",
			"Sec-Fetch-Site: none",
			"Sec-Fetch-User: ?1"]
		);
		
		curl_setopt($curlproc, CURLOPT_RETURNTRANSFER, true);
		curl_setopt($curlproc, CURLOPT_SSL_VERIFYHOST, 2);
		curl_setopt($curlproc, CURLOPT_SSL_VERIFYPEER, true);
		curl_setopt($curlproc, CURLOPT_CONNECTTIMEOUT, 30);
		curl_setopt($curlproc, CURLOPT_TIMEOUT, 30);
		
		$this->backend->assign_proxy($curlproc, $proxy);
		
		$data = curl_exec($curlproc);
		
		if(curl_errno($curlproc)){
			
			// grab the message before releasing the handle so the
			// failure path cleans up like the success path does
			$error = curl_error($curlproc);
			curl_close($curlproc);
			
			throw new Exception($error);
		}
		
		curl_close($curlproc);
		return $data;
	}
	
	/**
	 * Run a web search.
	 *
	 * @param array $get Request parameters; "s" holds the search term.
	 * @return array Result structure; only the "web" list is populated.
	 * @throws Exception On empty search term, fetch failure or invalid JSON.
	 */
	public function web($get){
		
		$search = $get["s"];
		if(strlen($search) === 0){
			
			throw new Exception("Search term is empty!");
		}
		
		$proxy = $this->backend->get_ip();
		
		try{
			$json = $this->get(
				$proxy,
				"https://crowdview-next-js.onrender.com/api/search-v3",
				[
					"query" => $search
				]
			);
		}catch(Exception $error){
			
			throw new Exception("Failed to fetch JSON");
		}
		
		$out = [
			"status" => "ok",
			"spelling" => [
				"type" => "no_correction",
				"using" => null,
				"correction" => null
			],
			"npt" => null,
			"answer" => [],
			"web" => [],
			"image" => [],
			"video" => [],
			"news" => [],
			"related" => []
		];
		
		$json = json_decode($json, true);
		
		if($json === NULL){
			
			throw new Exception("Failed to decode JSON");
		}
		
		// a response without a usable result list yields zero results
		// instead of PHP warnings
		$results =
			(
				is_array($json) &&
				isset($json["results"]) &&
				is_array($json["results"])
			) ?
			$json["results"] : [];
		
		foreach($results as $item){
			
			// an entry without title or link cannot be displayed
			if(
				!is_array($item) ||
				!isset($item["title"], $item["link"])
			){
				
				continue;
			}
			
			$snippet =
				isset($item["snippet"]) ?
				(string)$item["snippet"] : "";
			
			// the code treats whatever precedes the first <b> as a date
			$parts = explode("<b>", $snippet, 2);
			
			if(count($parts) === 2){
				
				$date = strtotime($parts[0]);
				
				if($date === false){
					
					// unparseable date
					$date = null;
				}
				
				$description = $parts[1];
			}else{
				
				// no <b> separator: the whole snippet is the description
				$date = null;
				$description = $snippet;
			}
			
			$out["web"][] = [
				"title" => $this->sanitize($item["title"]),
				"description" => $this->sanitize($description),
				"url" => $item["link"],
				"date" => $date,
				"type" => "web",
				"thumb" => [
					"url" => null,
					"ratio" => null
				],
				"sublink" => [],
				"table" => []
			];
		}
		
		return $out;
	}
	
	/**
	 * Decode HTML entities, reduce to text content, then trim dots and
	 * spaces from both ends.
	 *
	 * @param string $html Raw HTML fragment.
	 * @return string Cleaned text.
	 */
	private function sanitize($html){
		
		return
			trim(
				$this->fuckhtml
				->getTextContent(
					html_entity_decode(
						$html
					)
				),
				". "
			);
	}
}

View File

@@ -1,309 +0,0 @@
<?php
/**
 * Curlie web scraper.
 *
 * Fetches curlie.org search pages and extracts the listed sites from
 * the HTML.
 */
class curlie{
	
	// declared explicitly: creating properties on the fly is deprecated
	// since PHP 8.2. Kept public so outside access keeps working.
	public $backend;
	public $fuckhtml;
	
	public function __construct(){
		
		include "lib/backend.php";
		$this->backend = new backend("curlie");
		
		include "lib/fuckhtml.php";
		$this->fuckhtml = new fuckhtml();
	}
	
	/**
	 * List the filters available for a search page.
	 *
	 * @param string $page Search page name.
	 * @return array Language filter for "web", empty array otherwise.
	 */
	public function getfilters($page){
		
		if($page != "web"){
			
			return [];
		}
		
		return [
			"lang" => [
				"display" => "Language",
				"option" => [
					"any" => "Any language",
					"en" => "English",
					"de" => "German",
					"fr" => "French",
					"ja" => "Japanese",
					"it" => "Italian",
					"es" => "Spanish",
					"ru" => "Russian",
					"nl" => "Dutch",
					"pl" => "Polish",
					"tr" => "Turkish",
					"da" => "Danish",
					"sv" => "Swedish",
					"no" => "Norwegian",
					"is" => "Icelandic",
					"fo" => "Faroese",
					"fi" => "Finnish",
					"et" => "Estonian",
					"lt" => "Lithuanian",
					"lv" => "Latvian",
					"cy" => "Welsh",
					"ga" => "Irish",
					"gd" => "Scottish Gaelic",
					"br" => "Breton",
					"fy" => "Frisian",
					"frr" => "North Frisian",
					"gem" => "Saterland Frisian",
					"lb" => "Luxembourgish",
					"rm" => "Romansh",
					"pt" => "Portuguese",
					"ca" => "Catalan",
					"gl" => "Galician",
					"eu" => "Basque",
					"ast" => "Asturian",
					"an" => "Aragonese",
					"fur" => "Friulan",
					"sc" => "Sardinian",
					"scn" => "Sicilian",
					"oc" => "Occitan",
					"be" => "Belarusian",
					"cs" => "Czech",
					"hu" => "Hungarian",
					"sk" => "Slovak",
					"uk" => "Ukrainian",
					"csb" => "Kashubian",
					"tt" => "Tatar",
					"ba" => "Bashkir",
					"os" => "Ossetian",
					"sl" => "Slovene",
					"sr" => "Serbian",
					"hr" => "Croatian",
					"bs" => "Bosnian",
					"bg" => "Bulgarian",
					"sq" => "Albanian",
					"ro" => "Romanian",
					"mk" => "Macedonian",
					"el" => "Greek",
					"iw" => "Hebrew",
					"fa" => "Persian",
					"ar" => "Arabic",
					"ku" => "Kurdish",
					"az" => "Azerbaijani",
					"hy" => "Armenian",
					"af" => "Afrikaans",
					"sw" => "Kiswahili",
					"uz" => "Uzbek",
					"kk" => "Kazakh",
					"ky" => "Kyrgyz",
					"tg" => "Tajik",
					"tk" => "Turkmen",
					"ug" => "Uyghurche",
					"hi" => "Hindi",
					"si" => "Sinhalese",
					"gu" => "Gujarati",
					"ur" => "Urdu",
					"mr" => "Marathi",
					"pa" => "Punjabi",
					"bn" => "Bengali",
					"ta" => "Tamil",
					"te" => "Telugu",
					"kn" => "Kannada",
					"zh_CN" => "Chinese Simplified",
					"zh_TW" => "Chinese Traditional",
					"ko" => "Korean",
					"cfr" => "Taiwanese",
					"th" => "Thai",
					"vi" => "Vietnamese",
					"in" => "Indonesian",
					"ms" => "Malay",
					"tl" => "Tagalog",
					"eo" => "Esperanto",
					"ia" => "Interlingua",
					"la" => "Latin"
				]
			]
		];
	}
	
	/**
	 * Perform a GET request with browser-like headers.
	 *
	 * @param mixed  $proxy Proxy handed to backend->assign_proxy().
	 * @param string $url   Target URL without query string.
	 * @param array  $get   Query parameters, appended when non-empty.
	 * @return string Response body.
	 * @throws Exception With the cURL error message if the transfer fails.
	 */
	private function get($proxy, $url, $get = []){
		
		$curlproc = curl_init();
		
		if($get !== []){
			
			$get = http_build_query($get);
			$url .= "?" . $get;
		}
		
		curl_setopt($curlproc, CURLOPT_URL, $url);
		
		curl_setopt($curlproc, CURLOPT_ENCODING, ""); // default encoding
		curl_setopt($curlproc, CURLOPT_HTTPHEADER,
			["User-Agent: " . config::USER_AGENT,
			"Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
			"Accept-Language: en-US,en;q=0.5",
			"Accept-Encoding: gzip",
			"DNT: 1",
			"Connection: keep-alive",
			"Upgrade-Insecure-Requests: 1",
			"Sec-Fetch-Dest: document",
			"Sec-Fetch-Mode: navigate",
			"Sec-Fetch-Site: none",
			"Sec-Fetch-User: ?1"]
		);
		
		curl_setopt($curlproc, CURLOPT_RETURNTRANSFER, true);
		curl_setopt($curlproc, CURLOPT_SSL_VERIFYHOST, 2);
		curl_setopt($curlproc, CURLOPT_SSL_VERIFYPEER, true);
		curl_setopt($curlproc, CURLOPT_CONNECTTIMEOUT, 30);
		curl_setopt($curlproc, CURLOPT_TIMEOUT, 30);
		
		$this->backend->assign_proxy($curlproc, $proxy);
		
		$data = curl_exec($curlproc);
		
		if(curl_errno($curlproc)){
			
			// grab the message before releasing the handle so the
			// failure path cleans up like the success path does
			$error = curl_error($curlproc);
			curl_close($curlproc);
			
			throw new Exception($error);
		}
		
		curl_close($curlproc);
		return $data;
	}
	
	/**
	 * Run a web search, or continue one from a next-page token.
	 *
	 * @param array $get Request parameters: "npt" (next-page token, falsy
	 *                   for a fresh search), "s" (search term) and "lang"
	 *                   (language filter value, "any" to disable).
	 * @return array Result structure; "web" and "npt" are populated.
	 * @throws Exception If the search page cannot be fetched.
	 */
	public function web($get){
		
		if($get["npt"]){
			
			// the stored token holds the relative URL of the next page
			[$query, $proxy] = $this->backend->get($get["npt"], "web");
			
			try{
				$html = $this->get(
					$proxy,
					"https://curlie.org/" . $query,
					[]
				);
			}catch(Exception $error){
				
				throw new Exception("Failed to fetch search page");
			}
			
		}else{
			
			$proxy = $this->backend->get_ip();
			
			$query = [
				"q" => $get["s"],
				"start" => 0,
				"stime" => 92452189 // ?
			];
			
			if($get["lang"] !== "any"){
				
				$query["lang"] = $get["lang"];
			}
			
			try{
				$html = $this->get(
					$proxy,
					"https://curlie.org/search",
					$query
				);
			}catch(Exception $error){
				
				throw new Exception("Failed to fetch search page");
			}
		}
		
		$this->fuckhtml->load($html);
		
		$nextpage =
			$this->fuckhtml
			->getElementsByClassName(
				"next-page",
				"a"
			);
		
		if(
			count($nextpage) !== 0 &&
			isset($nextpage[0]["attributes"]["href"]) // link without href is useless
		){
			
			$nextpage =
				$this->backend->store(
					$nextpage[0]["attributes"]["href"],
					"web",
					$proxy
				);
		}else{
			
			$nextpage = null;
		}
		
		$out = [
			"status" => "ok",
			"spelling" => [
				"type" => "no_correction",
				"using" => null,
				"correction" => null
			],
			"npt" => $nextpage,
			"answer" => [],
			"web" => [],
			"image" => [],
			"video" => [],
			"news" => [],
			"related" => []
		];
		
		$items =
			$this->fuckhtml
			->getElementsByClassName(
				"site-item",
				"div"
			);
		
		foreach($items as $item){
			
			// scope the following lookups to this result only
			$this->fuckhtml->load($item);
			
			$links =
				$this->fuckhtml
				->getElementsByAttributeValue(
					"target",
					"_blank",
					"a"
				);
			
			// skip entries that carry no usable outbound link instead of
			// indexing into an empty list
			if(
				count($links) === 0 ||
				!isset($links[0]["attributes"]["href"])
			){
				
				continue;
			}
			
			$a = $links[0];
			
			$description =
				$this->fuckhtml
				->getElementsByClassName("site-descr");
			
			if(count($description) !== 0){
				
				$description =
					$this->fuckhtml
					->getTextContent(
						$description[0]
					);
			}else{
				
				$description = null;
			}
			
			$out["web"][] = [
				"title" =>
					$this->fuckhtml
					->getTextContent(
						$a
					),
				"description" => $description,
				"url" =>
					$this->fuckhtml
					->getTextContent(
						$a["attributes"]["href"]
					),
				"date" => null,
				"type" => "web",
				"thumb" => [
					"url" => null,
					"ratio" => null
				],
				"sublink" => [],
				"table" => []
			];
		}
		
		return $out;
	}
}

View File

@@ -12,6 +12,8 @@ class google{
include "lib/backend.php"; include "lib/backend.php";
$this->backend = new backend("google"); $this->backend = new backend("google");
$this->message = "Still working on a Google scraper that uses a headful browser. It will require Firefox + a webExtension running on a dedicated server. Waiting for my EDID adapter and we can get the show going. In the meantime, use the Google CSE/API or Yahoo JP/Startpage scrapers. They're all crippled in their own special ways but they're serviceable I guess.";
} }
public function getfilters($page){ public function getfilters($page){
@@ -505,7 +507,7 @@ class google{
} }
} }
private function get($proxy, $url, $get = [], $alt_ua = false){ private function get($proxy, $url, $get = []){
$curlproc = curl_init(); $curlproc = curl_init();
@@ -518,35 +520,22 @@ class google{
curl_setopt($curlproc, CURLOPT_ENCODING, ""); // default encoding curl_setopt($curlproc, CURLOPT_ENCODING, ""); // default encoding
if($alt_ua === true){ curl_setopt($curlproc, CURLOPT_HTTP_VERSION, CURL_HTTP_VERSION_2_0);
curl_setopt($curlproc, CURLOPT_HTTPHEADER, [
curl_setopt($curlproc, CURLOPT_HTTPHEADER, [ "User-Agent: " . config::USER_AGENT,
"User-Agent: Mozilla/5.0 (iPhone; CPU iPhone OS 26_0_0 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) GSA/406.0.862495628 Mobile/15E148 Safari/604.1", "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
"Accept: text/html, application/xml;q=0.9, */*;q=0.8", "Accept-Language: en-US,en;q=0.5",
"Accept-Language: en-US,en;q=0.8", "Accept-Encoding: gzip",
"Accept-Encoding: gzip, deflate", "DNT: 1",
"Connection: Keep-Alive", "Connection: keep-alive",
"Cache-Control: no-cache" "Upgrade-Insecure-Requests: 1",
]); "Sec-Fetch-Dest: document",
}else{ "Sec-Fetch-Mode: navigate",
"Sec-Fetch-Site: none",
curl_setopt($curlproc, CURLOPT_HTTP_VERSION, CURL_HTTP_VERSION_2_0); "Sec-Fetch-User: ?1",
curl_setopt($curlproc, CURLOPT_HTTPHEADER, [ "Priority: u=1",
"User-Agent: " . config::USER_AGENT, "TE: trailers"
"Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8", ]);
"Accept-Language: en-US,en;q=0.5",
"Accept-Encoding: gzip",
"DNT: 1",
"Connection: keep-alive",
"Upgrade-Insecure-Requests: 1",
"Sec-Fetch-Dest: document",
"Sec-Fetch-Mode: navigate",
"Sec-Fetch-Site: none",
"Sec-Fetch-User: ?1",
"Priority: u=1",
"TE: trailers"
]);
}
curl_setopt($curlproc, CURLOPT_RETURNTRANSFER, true); curl_setopt($curlproc, CURLOPT_RETURNTRANSFER, true);
curl_setopt($curlproc, CURLOPT_SSL_VERIFYHOST, 2); curl_setopt($curlproc, CURLOPT_SSL_VERIFYHOST, 2);
@@ -574,228 +563,22 @@ class google{
public function web($get){ public function web($get){
throw new Exception("There are no known ways to scrape Google's /search endpoint without JS at this time. I'm working on a method that extracts cookies from browsers. Use Google API/CSE/Yahoo JP/Startpage for google results for now."); throw new Exception($this->message);
}
public function video($get){
throw new Exception("There are no known ways to scrape Google's /search endpoint without JS at this time. I'm working on a method that extracts cookies from browsers. Use Google API/CSE/Yahoo JP/Startpage for google results for now.");
}
public function news($get){
throw new Exception("There are no known ways to scrape Google's /search endpoint without JS at this time. I'm working on a method that extracts cookies from browsers. Use Google API/CSE/Yahoo JP/Startpage for google results for now.");
} }
public function image($get){ public function image($get){
throw new Exception($this->message);
// generate parameters }
if($get["npt"]){
[$params, $proxy] = public function video($get){
$this->backend->get( throw new Exception($this->message);
$get["npt"], }
"images"
);
public function news($get){
$params = json_decode($params, true); throw new Exception($this->message);
$page = $params["page"] + 1;
$params = $params["params"];
$params["async"] = "_fmt:json,p:1,ijn:{$page}";
}else{
$search = $get["s"];
if(strlen($search) === 0){
throw new Exception("Search term is empty!");
}
$proxy = $this->backend->get_ip();
$country = $get["country"];
$nsfw = $get["nsfw"];
$time = $get["time"];
$size = $get["size"];
$ratio = $get["ratio"];
$color = $get["color"];
$type = $get["type"];
$format = $get["format"];
$rights = $get["rights"];
$page = 0;
$params = [
"q" => $search,
"tbm" => "isch",
"asearch" => "isch",
"async" => "_fmt:json,p:0,ijn:{$page}", // ijn:0 = page 1
];
// country (image search uses cr instead of gl)
if($country != "any"){
$params["cr"] = "country" . strtoupper($country);
}
// nsfw
$params["safe"] = $nsfw == "yes" ? "off" : "active";
// generate tbs
$tbs = [];
// time
if($time != "any"){
$tbs["qdr"] = $time;
}
// size
if($size != "any"){
$params["imgsz"] = $size;
}
// ratio
if($ratio != "any"){
$params["imgar"] = $ratio;
}
// color
if($color != "any"){
if(
$color == "color" ||
$color == "trans"
){
$params["imgc"] = $color;
}elseif($color == "bnw"){
$params["imgc"] = "gray";
}else{
$tbs["ic"] = "specific";
$tbs["isc"] = $color;
}
}
// type
if($type != "any"){
$tbs["itp"] = $type;
}
// format
if($format != "any"){
$params["as_filetype"] = $format;
}
// rights (tbs)
if($rights != "any"){
$tbs["sur"] = $rights;
}
// append tbs
if(count($tbs) !== 0){
$params["tbs"] = "";
foreach($tbs as $key => $value){
$params["tbs"] .= $key . ":" . $value . ",";
}
$params["tbs"] = rtrim($params["tbs"], ",");
}
}
try{
$json =
$this->get(
$proxy,
"https://www.google.com/search",
$params
);
}catch(Exception $error){
throw new Exception("Failed to get search page");
}
unset($params["async"]);
//$json = file_get_contents("scraper/google.json");
// detect captcha
$this->fuckhtml->load($json);
$this->detect_sorry();
// remove xssi
$json =
preg_replace(
'/^[^{]*/',
"",
$json
);
$json = json_decode($json, true);
if($json === null){
throw new Exception("Failed to decode JSON");
}
$out = [
"status" => "ok",
"npt" => null,
"image" => []
];
if(!isset($json["ischj"]["metadata"])){
throw new Exception("Google did not return an image array");
}
foreach($json["ischj"]["metadata"] as $image){
$out["image"][] = [
"title" => $this->titledots($image["result"]["page_title"]),
"source" => [
[
"url" => $image["original_image"]["url"],
"width" => (int)$image["original_image"]["width"],
"height" => (int)$image["original_image"]["height"]
],
[
"url" => $image["thumbnail"]["url"],
"width" => (int)$image["thumbnail"]["width"],
"height" => (int)$image["thumbnail"]["height"]
]
],
"url" => $image["result"]["referrer_url"]
];
}
$page++;
if(count($out["image"]) === 10){
$out["npt"] =
$this->backend->store(
json_encode([
"params" => $params,
"page" => $page
]),
"images",
$proxy
);
}
return $out;
} }

View File

@@ -264,6 +264,25 @@ class google_api{
"yes" => "Yes", // safe=active "yes" => "Yes", // safe=active
"no" => "No" // safe=off "no" => "No" // safe=off
] ]
],
"sort" => [ // sort
"display" => "Sort by",
"option" => [
"any" => "Any order",
"date:d" => "Oldest",
"date:a" => "Newest"
]
],
"newer" => [
"display" => "Newer than",
"option" => "_DATE"
],
"rm_dupes" => [ // filter
"display" => "Remove duplicates",
"option" => [
"yes" => "Yes", // 1
"no" => "No" // 0
]
] ]
]; ];
@@ -313,109 +332,29 @@ class google_api{
"zh-CN" => "Chinese (Simplified)", "zh-CN" => "Chinese (Simplified)",
"zh-TW" => "Chinese (Traditional)" "zh-TW" => "Chinese (Traditional)"
] ]
],
"sort" => [
"display" => "Sort by",
"option" => [
"any" => "Any order",
"date:d" => "Oldest",
"date:a" => "Newest"
]
],
"newer" => [
"display" => "Newer than",
"option" => "_DATE"
],
"rm_dupes" => [
"display" => "Remove duplicates",
"option" => [
"yes" => "Yes",
"no" => "No"
]
] ]
] ]
); );
break; break;
/*
case "images": case "images":
return array_merge( return array_merge(
$base, $base,
[ [
"time" => [ // tbs=qdr:<time> "size" => [ // imgSize
"display" => "Time posted",
"option" => [
"any" => "Any time",
"d" => "Past 24 hours",
"w" => "Past week",
"m" => "Past month",
"y" => "Past year"
]
],
"size" => [ // imgsz
"display" => "Size", "display" => "Size",
"option" => [ "option" => [
"any" => "Any size", "any" => "Any size",
"l" => "Large", "icon" => "Icon",
"m" => "Medium", "small" => "Small",
"i" => "Icon", "medium" => "Medium",
"qsvga" => "Larger than 400x300", "large" => "Large",
"vga" => "Larger than 640x480", "xlarge" => "X-Large",
"svga" => "Larger than 800x600", "xxlarge" => "XX-Large",
"xga" => "Larger than 1024x768", "huge" => "Huge"
"2mp" => "Larger than 2MP",
"4mp" => "Larger than 4MP",
"6mp" => "Larger than 6MP",
"8mp" => "Larger than 8MP",
"10mp" => "Larger than 10MP",
"12mp" => "Larger than 12MP",
"15mp" => "Larger than 15MP",
"20mp" => "Larger than 20MP",
"40mp" => "Larger than 40MP",
"70mp" => "Larger than 70MP"
] ]
], ],
"ratio" => [ // imgar "format" => [ // fileType
"display" => "Aspect ratio",
"option" => [
"any" => "Any ratio",
"t|xt" => "Tall",
"s" => "Square",
"w" => "Wide",
"xw" => "Panoramic"
]
],
"color" => [ // imgc
"display" => "Color",
"option" => [
"any" => "Any color",
"color" => "Full color",
"bnw" => "Black & white",
"trans" => "Transparent",
// from here, imgcolor
"red" => "Red",
"orange" => "Orange",
"yellow" => "Yellow",
"green" => "Green",
"teal" => "Teal",
"blue" => "Blue",
"purple" => "Purple",
"pink" => "Pink",
"white" => "White",
"gray" => "Gray",
"black" => "Black",
"brown" => "Brown"
]
],
"type" => [ // tbs=itp:<type>
"display" => "Type",
"option" => [
"any" => "Any type",
"clipart" => "Clip Art",
"lineart" => "Line Drawing",
"animated" => "Animated"
]
],
"format" => [ // as_filetype
"display" => "Format", "display" => "Format",
"option" => [ "option" => [
"any" => "Any format", "any" => "Any format",
@@ -429,17 +368,55 @@ class google_api{
"craw" => "RAW" "craw" => "RAW"
] ]
], ],
"rights" => [ // tbs=sur:<rights> "color" => [
"display" => "Color",
"option" => [
"any" => "Any color",
"color" => "Full color", // imgColorType
"mono" => "Black & White",
"trans" => "Transparent background",
"red" => "Red", // imgDominantColor
"orange" => "Orange",
"yellow" => "Yellow",
"green" => "Green",
"teal" => "Teal",
"blue" => "Blue",
"purple" => "Purple",
"pink" => "Pink",
"white" => "White",
"gray" => "Gray",
"black" => "Black",
"brown" => "Brown"
]
],
"type" => [ // imgType
"display" => "Type",
"option" => [
"any" => "Any type",
"clipart" => "Clip Art",
"face" => "Faces",
"lineart" => "Line Drawing",
"stock" => "Stock photos",
"photo" => "Photos",
"animated" => "Animated",
]
],
"rights" => [ // rights
"display" => "Usage rights", "display" => "Usage rights",
"option" => [ "option" => [
"any" => "Any license", "any" => "Any license",
"cl" => "Creative Commons licenses", "cc_publicdomain" => "Public domain",
"ol" => "Commercial & other licenses" "cc_attribute" => "Attribution required",
"cc_sharealike" => "Sharealike",
"cc_noncommercial" => "Non-commercial use only",
"cc_nonderived" => "Original works"
] ]
] ]
] ]
); );
break;*/ break;
} }
} }
@@ -485,6 +462,7 @@ class google_api{
return $data; return $data;
} }
public function web($get){ public function web($get){
// rotate proxy + key on EVERY request // rotate proxy + key on EVERY request
@@ -731,6 +709,160 @@ class google_api{
return $out; return $out;
} }
/**
 * Search images through the Google Custom Search JSON API.
 *
 * @param array $get Request parameters. Reads "npt" (next page token) or,
 *                   for a first page, "s" (query) plus the filters "newer",
 *                   "rm_dupes", "country", "nsfw", "sort", "size", "format",
 *                   "color", "type" and "rights".
 * @return array     ["status" => "ok", "npt" => token|null, "image" => [...]]
 * @throws Exception When the page token cannot be decoded, the request
 *                   fails, the response is not JSON or the API reports an
 *                   error.
 */
public function image($get){
	
	// rotate proxy + key on EVERY request
	$keydata = $this->backend->get_key();
	$proxy = $this->backend->get_ip($keydata["increment"]);
	
	if($get["npt"]){
		
		// the proxy stored alongside the token is deliberately ignored:
		// a fresh proxy/key pair was already picked above
		// NOTE(review): tokens are read and stored under the "web" page
		// type even though this is the image endpoint -- confirm intended
		[$params] =
			$this->backend->get(
				$get["npt"],
				"web"
			);
		
		$params = json_decode($params, true);
		
		if(!is_array($params)){
			
			throw new Exception("Failed to decode next page token");
		}
		
		// the key is stripped before storing the token, put a fresh one back
		$params["key"] = $keydata["key"];
		
	}else{
		
		//$json = file_get_contents("scraper/google.json");
		$params = [
			"q" => $get["s"],
			"cx" => config::GOOGLE_CX_ENDPOINT,
			"num" => 10,
			"start" => 1,
			"searchType" => "image",
			"key" => $keydata["key"]
		];
		
		//
		// parse filters
		//
		if($get["newer"] !== false){
			
			// dateRestrict=d<N> means "the past N days". A day is 86400
			// seconds; round up so the requested date always falls inside
			// the window, and never send "d0"
			$days = (int)ceil((time() - $get["newer"]) / 86400);
			$params["dateRestrict"] = "d" . max(1, $days);
		}
		
		if($get["rm_dupes"] == "no"){ $params["filter"] = "0"; }
		if($get["country"] != "any"){ $params["gl"] = $get["country"]; }
		
		if($get["nsfw"] == "yes"){
			
			$params["safe"] = "off";
		}else{
			
			$params["safe"] = "active";
		}
		
		if($get["sort"] != "any"){ $params["sort"] = $get["sort"]; }
		
		// image filters
		if($get["size"] != "any"){ $params["imgSize"] = $get["size"]; }
		if($get["format"] != "any"){ $params["fileType"] = $get["format"]; }
		
		// the single "color" filter maps to two different API parameters
		switch($get["color"]){
			
			case "any":
				break;
			
			case "color":
			case "mono":
			case "trans":
				$params["imgColorType"] = $get["color"];
				break;
			
			default:
				$params["imgDominantColor"] = $get["color"];
				break;
		}
		
		if($get["type"] != "any"){ $params["imgType"] = $get["type"]; }
		if($get["rights"] != "any"){ $params["rights"] = $get["rights"]; }
	}
	
	try{
		
		$json =
			$this->get(
				$proxy,
				"https://www.googleapis.com/customsearch/v1",
				$params
			);
		
	}catch(Exception $error){
		
		throw new Exception("Failed to fetch JSON");
	}
	
	$json = json_decode($json, true);
	
	if($json === null){
		
		throw new Exception("Failed to decode JSON");
	}
	
	$out = [
		"status" => "ok",
		"npt" => null,
		"image" => []
	];
	
	if(isset($json["error"]["message"])){
		
		throw new Exception(
			"API returned an error: " .
			$json["error"]["message"] .
			" (key #" . $keydata["increment"] . ")"
		);
	}
	
	if(!isset($json["items"])){
		
		// google just doesnt return items when theres no results
		return $out;
	}
	
	foreach($json["items"] as $image){
		
		if(!isset($image["link"])){
			
			// without a link there is no image to show
			continue;
		}
		
		$meta =
			(isset($image["image"]) && is_array($image["image"])) ?
			$image["image"] : [];
		
		// full size image first
		$source = [
			[
				"url" => $image["link"],
				"width" => isset($meta["width"]) ? (int)$meta["width"] : null,
				"height" => isset($meta["height"]) ? (int)$meta["height"] : null
			]
		];
		
		// thumbnail last, only when the API gave us one
		if(isset($meta["thumbnailLink"])){
			
			$source[] = [
				"url" => $meta["thumbnailLink"],
				"width" => isset($meta["thumbnailWidth"]) ? (int)$meta["thumbnailWidth"] : null,
				"height" => isset($meta["thumbnailHeight"]) ? (int)$meta["thumbnailHeight"] : null
			];
		}
		
		$out["image"][] = [
			"title" =>
				isset($image["title"]) ?
				$this->titledots($image["title"]) : null,
			"source" => $source,
			// fall back to the image itself if the hosting page is unknown
			"url" =>
				isset($meta["contextLink"]) ?
				$meta["contextLink"] : $image["link"]
		];
	}
	
	// get npt
	if(isset($json["queries"]["nextPage"][0]["startIndex"])){
		
		// never persist the API key inside the token
		unset($params["key"]);
		$params["start"] = (int)$json["queries"]["nextPage"][0]["startIndex"];
		
		$out["npt"] =
			$this->backend->store(
				json_encode($params),
				"web",
				$proxy
			);
	}
	
	return $out;
}
private function titledots($title){ private function titledots($title){
return trim($title, " .\t\n\r\0\x0B"); return trim($title, " .\t\n\r\0\x0B");

View File

@@ -1,452 +0,0 @@
<?php
// greppr dev probably monitors 4get code, lol
// hello greppr dude, add an API you moron
/**
 * Scraper for greppr.org web search.
 *
 * The site has no API: a search is done by loading the homepage, collecting
 * the form inputs (tokens) and cookies, then POSTing them to /search.
 */
class greppr{
	
	// declared explicitly: creating properties dynamically is deprecated
	// since PHP 8.2
	public $backend;
	public $fuckhtml;
	
	public function __construct(){
		
		include "lib/backend.php";
		$this->backend = new backend("greppr");
		
		include "lib/fuckhtml.php";
		$this->fuckhtml = new fuckhtml();
	}
	
	/**
	 * This scraper exposes no filters.
	 *
	 * @param string $page Page type (unused)
	 * @return array       Always empty
	 */
	public function getfilters($page){
		
		return [];
	}
	
	/**
	 * Perform a GET or POST request through the given proxy.
	 *
	 * @param mixed      $proxy   Proxy handed to backend->assign_proxy()
	 * @param string     $url     Target URL
	 * @param array      $get     Query parameters (GET) or form fields (POST)
	 * @param array|null $cookies Cookie name => value pairs
	 * @param bool       $post    Send as POST when not false
	 * @return array              ["headers" => [lowercase name => [values]], "data" => body]
	 * @throws Exception          With the curl error message on transfer failure
	 */
	private function get($proxy, $url, $get = [], $cookies = [], $post = false){
		
		// tolerate a missing cookie jar (eg. the homepage set no cookies)
		if(!is_array($cookies)){
			
			$cookies = [];
		}
		
		$cookie = [];
		
		foreach($cookies as $k => $v){
			
			$cookie[] = "{$k}={$v}";
		}
		
		$cookie = implode("; ", $cookie);
		
		$curlproc = curl_init();
		
		curl_setopt($curlproc, CURLOPT_ENCODING, ""); // default encoding
		
		if($post === false){
			
			if($get !== []){
				
				$url .= "?" . http_build_query($get);
			}
			
			if($cookie == ""){
				
				curl_setopt($curlproc, CURLOPT_HTTPHEADER,
					["User-Agent: " . config::USER_AGENT,
					"Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
					"Accept-Language: en-US,en;q=0.5",
					"Accept-Encoding: gzip",
					"DNT: 1",
					"Connection: keep-alive",
					"Upgrade-Insecure-Requests: 1",
					"Sec-Fetch-Dest: document",
					"Sec-Fetch-Mode: navigate",
					"Sec-Fetch-Site: none",
					"Sec-Fetch-User: ?1"]
				);
			}else{
				
				curl_setopt($curlproc, CURLOPT_HTTPHEADER,
					["User-Agent: " . config::USER_AGENT,
					"Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
					"Accept-Language: en-US,en;q=0.5",
					"Accept-Encoding: gzip, deflate, br, zstd",
					"DNT: 1",
					"Sec-GPC: 1",
					"Connection: keep-alive",
					"Referer: https://greppr.org/search",
					"Cookie: {$cookie}",
					"Upgrade-Insecure-Requests: 1",
					"Sec-Fetch-Dest: document",
					"Sec-Fetch-Mode: navigate",
					"Sec-Fetch-Site: same-origin",
					"Sec-Fetch-User: ?1",
					"Priority: u=0, i"]
				);
			}
		}else{
			
			$get = http_build_query($get);
			
			curl_setopt($curlproc, CURLOPT_POST, true);
			curl_setopt($curlproc, CURLOPT_POSTFIELDS, $get);
			
			$request_headers = [
				"User-Agent: " . config::USER_AGENT,
				"Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
				"Accept-Language: en-US,en;q=0.5",
				"Accept-Encoding: gzip, deflate, br, zstd",
				"Content-Type: application/x-www-form-urlencoded",
				"Content-Length: " . strlen($get),
				"Origin: https://greppr.org",
				"DNT: 1",
				"Sec-GPC: 1",
				"Connection: keep-alive",
				"Referer: https://greppr.org/"
			];
			
			// only announce cookies when we actually have some
			if($cookie != ""){
				
				$request_headers[] = "Cookie: {$cookie}";
			}
			
			array_push(
				$request_headers,
				"Upgrade-Insecure-Requests: 1",
				"Sec-Fetch-Dest: document",
				"Sec-Fetch-Mode: navigate",
				"Sec-Fetch-Site: same-origin",
				"Sec-Fetch-User: ?1",
				"Priority: u=0, i"
			);
			
			curl_setopt($curlproc, CURLOPT_HTTPHEADER, $request_headers);
		}
		
		// set the URL only now, so the query string appended above is
		// actually part of the request
		curl_setopt($curlproc, CURLOPT_URL, $url);
		
		curl_setopt($curlproc, CURLOPT_RETURNTRANSFER, true);
		curl_setopt($curlproc, CURLOPT_SSL_VERIFYHOST, 2);
		curl_setopt($curlproc, CURLOPT_SSL_VERIFYPEER, true);
		curl_setopt($curlproc, CURLOPT_CONNECTTIMEOUT, 30);
		curl_setopt($curlproc, CURLOPT_TIMEOUT, 30);
		
		$this->backend->assign_proxy($curlproc, $proxy);
		
		// collect response headers, keyed by lowercase name
		$headers = [];
		
		curl_setopt(
			$curlproc,
			CURLOPT_HEADERFUNCTION,
			function($curlproc, $header) use (&$headers){
				
				$len = strlen($header);
				$header = explode(':', $header, 2);
				
				if(count($header) < 2){
					
					// ignore invalid headers
					return $len;
				}
				
				$headers[strtolower(trim($header[0]))][] = trim($header[1]);
				
				return $len;
			}
		);
		
		$data = curl_exec($curlproc);
		
		if(curl_errno($curlproc)){
			
			// release the handle before bailing out
			$error = curl_error($curlproc);
			curl_close($curlproc);
			
			throw new Exception($error);
		}
		
		curl_close($curlproc);
		
		return [
			"headers" => $headers,
			"data" => $data
		];
	}
	
	/**
	 * Run a web search, or fetch the next page of a previous one.
	 *
	 * @param array $get           Request parameters: "npt" (next page token) or "s" (query)
	 * @param bool  $first_attempt Unused by this method, kept for interface compatibility
	 * @return array               Result set; "web" holds the results, "npt" the next page token
	 * @throws Exception           On empty query, network failure or unparsable token data
	 */
	public function web($get, $first_attempt = true){
		
		if($get["npt"]){
			
			[$q, $proxy] = $this->backend->get($get["npt"], "web");
			
			$tokens = json_decode($q, true);
			
			if(!isset($tokens["get"])){
				
				throw new Exception("Failed to decode next page token");
			}
			
			if(
				!isset($tokens["cookies"]) ||
				!is_array($tokens["cookies"])
			){
				
				$tokens["cookies"] = [];
			}
			
			//
			// Get paginated page
			//
			try{
				
				$html = $this->get(
					$proxy,
					"https://greppr.org" . $tokens["get"],
					[],
					$tokens["cookies"],
					false
				);
				
			}catch(Exception $error){
				
				throw new Exception("Failed to fetch search page");
			}
			
		}else{
			
			$search = $get["s"];
			
			if(strlen($search) === 0){
				
				throw new Exception("Search term is empty!");
			}
			
			$proxy = $this->backend->get_ip();
			
			//
			// get token
			//
			try{
				
				$html =
					$this->get(
						$proxy,
						"https://greppr.org",
						[],
						[],
						false
					);
				
			}catch(Exception $error){
				
				throw new Exception("Failed to fetch homepage");
			}
			
			//
			// Parse token
			//
			$this->fuckhtml->load($html["data"]);
			
			// "req" is the name of the (empty) search input, "data" holds
			// every pre-filled input that must be sent back with the search
			$tokens = [
				"req" => null,
				"data" => [],
				"cookies" => []
			];
			
			$inputs =
				$this->fuckhtml
				->getElementsByTagName(
					"input"
				);
			
			foreach($inputs as $input){
				
				if(!isset($input["attributes"]["name"])){
					
					continue;
				}
				
				$name =
					$this->fuckhtml
					->getTextContent(
						$input["attributes"]["name"]
					);
				
				if(
					isset($input["attributes"]["value"]) &&
					!empty($input["attributes"]["value"])
				){
					
					$tokens["data"][$name] =
						$this->fuckhtml
						->getTextContent(
							$input["attributes"]["value"]
						);
				}else{
					
					$tokens["req"] = $name;
				}
			}
			
			if($tokens["req"] === null){
				
				throw new Exception("Failed to get request ID");
			}
			
			if(isset($html["headers"]["set-cookie"])){
				
				foreach($html["headers"]["set-cookie"] as $cookie){
					
					if(
						preg_match(
							'/([^=]+)=([^;]+)/',
							$cookie,
							$matches
						)
					){
						
						$tokens["cookies"][$matches[1]] = $matches[2];
					}
				}
			}
			
			//
			// Get initial search page
			//
			$tokens_req = $tokens["data"];
			$tokens_req[$tokens["req"]] = $search;
			
			try{
				
				$html = $this->get(
					$proxy,
					"https://greppr.org/search",
					$tokens_req,
					$tokens["cookies"],
					true
				);
				
			}catch(Exception $error){
				
				throw new Exception("Failed to fetch search page");
			}
		}
		
		//$html = file_get_contents("scraper/greppr.html");
		//$this->fuckhtml->load($html);
		$this->fuckhtml->load($html["data"]);
		
		$out = [
			"status" => "ok",
			"spelling" => [
				"type" => "no_correction",
				"using" => null,
				"correction" => null
			],
			"npt" => null,
			"answer" => [],
			"web" => [],
			"image" => [],
			"video" => [],
			"news" => [],
			"related" => []
		];
		
		// get results for later
		$results =
			$this->fuckhtml
			->getElementsByClassName(
				"result",
				"div"
			);
		
		// check for next page
		$next_elem =
			$this->fuckhtml
			->getElementsByClassName(
				"pagination",
				"ul"
			);
		
		if(count($next_elem) !== 0){
			
			$this->fuckhtml->load($next_elem[0]);
			
			$as =
				$this->fuckhtml
				->getElementsByClassName(
					"page-link",
					"a"
				);
			
			// the current page links to "#": the link right after it is
			// the next page
			$break = false;
			
			foreach($as as $a){
				
				if(!isset($a["attributes"]["href"])){
					
					// not a usable link
					continue;
				}
				
				if($break === true){
					
					$out["npt"] =
						$this->backend->store(
							json_encode([
								"get" =>
									$this->fuckhtml
									->getTextContent(
										$a["attributes"]["href"]
									),
								"cookies" => $tokens["cookies"]
							]),
							"web",
							$proxy
						);
					break;
				}
				
				if($a["attributes"]["href"] == "#"){
					
					$break = true;
				}
			}
		}
		
		// scrape results
		foreach($results as $result){
			
			$this->fuckhtml->load($result);
			
			$links =
				$this->fuckhtml
				->getElementsByTagName(
					"a"
				);
			
			if(
				count($links) === 0 ||
				!isset($links[0]["attributes"]["href"])
			){
				
				// a result without a link is useless, skip it
				continue;
			}
			
			$a = $links[0];
			
			$description =
				$this->fuckhtml
				->getElementsByClassName(
					"highlightedDesc",
					"p"
				);
			
			if(count($description) === 0){
				
				$description = null;
			}else{
				
				$description =
					$this->limitstrlen(
						$this->fuckhtml
						->getTextContent(
							$description[0]
						)
					);
			}
			
			// the last paragraph looks like "<label>:<date>"
			$date = null;
			
			$paragraphs =
				$this->fuckhtml
				->getElementsByTagName(
					"p"
				);
			
			if(count($paragraphs) !== 0){
				
				$date_parts =
					explode(
						":",
						$this->fuckhtml
						->getTextContent(
							$paragraphs[count($paragraphs) - 1]["innerHTML"]
						)
					);
				
				if(isset($date_parts[1])){
					
					$timestamp = strtotime($date_parts[1]);
					
					if($timestamp !== false){
						
						$date = $timestamp;
					}
				}
			}
			
			$out["web"][] = [
				"title" =>
					$this->fuckhtml
					->getTextContent(
						$a["innerHTML"]
					),
				"description" => $description,
				"url" =>
					$this->fuckhtml
					->getTextContent(
						$a["attributes"]["href"]
					),
				"date" => $date,
				"type" => "web",
				"thumb" => [
					"url" => null,
					"ratio" => null
				],
				"sublink" => [],
				"table" => []
			];
		}
		
		return $out;
	}
	
	/**
	 * Cut text at a word boundary so it is at most ~300 characters long.
	 *
	 * @param string $text
	 * @return string First wrapped line of $text
	 */
	private function limitstrlen($text){
		
		return explode("\n", wordwrap($text, 300, "\n"))[0];
	}
}

View File

@@ -297,6 +297,14 @@ class pinterest{
throw new Exception("Failed to decode JSON"); throw new Exception("Failed to decode JSON");
} }
if(
isset($json["client_context"]["is_bad_bot"]) &&
(int)$json["client_context"]["is_bad_bot"] === 1
){
throw new Exception("Pinterest blocked this instance or request proxy.");
}
$out = [ $out = [
"status" => "ok", "status" => "ok",
"npt" => null, "npt" => null,
@@ -426,7 +434,7 @@ class pinterest{
] ]
], ],
"url" => "url" =>
$item["link"] === null ? !isset($item["link"]) ?
"https://ca.pinterest.com/pin/" . $item["id"] : "https://ca.pinterest.com/pin/" . $item["id"] :
$item["link"] $item["link"]
]; ];

View File

@@ -564,12 +564,16 @@ class startpage{
break; break;
case "spellsuggest-google": case "spellsuggest-google":
$out["spelling"] =
[ if(isset($category["results"][0]["query"])){
"type" => "including",
"using" => $json["render"]["query"], $out["spelling"] =
"correction" => $category["results"][0]["query"] [
]; "type" => "including",
"using" => $json["render"]["query"],
"correction" => urldecode($category["results"][0]["query"])
];
}
break; break;
case "dictionary-qi": case "dictionary-qi":
@@ -645,318 +649,6 @@ class startpage{
} }
} }
// parse instant answers
if(
$get["extendedsearch"] == "yes" &&
$get_instant_answer === true
){
// https://www.startpage.com/sp/qi?qimsn=ex&sxap=%2Fv1%2Fquery&sc=BqZ3inqrAgF701&sr=1
try{
$post = [
"se" => "n0vze2y9dqwy",
"q" => $json["render"]["query"],
"results" => [], // populate
"enableKnowledgePanel" => true,
"enableMediaThumbBar" => false,
"enableSearchSuggestions" => false,
"enableTripadvisorProperties" => [],
"enableTripadvisorPlaces" => [],
"enableTripadvisorPlacesForLocations" => [],
"enableWebProducts" => false,
"tripadvisorPartnerId" => null,
"tripadvisorMapColorMode" => "light",
"tripadvisorDisablesKnowledgePanel" => false,
"instantAnswers" => [
"smartAnswers",
"youtube",
"tripadvisor"
],
"iaType" => null,
"forceEnhancedKnowledgePanel" => false,
"shoppingOnly" => false,
"allowAdultProducts" => true,
"lang" => "en",
"browserLang" => "en-US",
"browserTimezone" => "America/New_York",
"market" => null,
"userLocation" => null,
"userDate" => date("Y-m-d"),
"userAgentType" => "unknown"
];
foreach($out["web"] as $result){
$post["results"][] = [
"url" => $result["url"],
"title" => $result["title"]
];
}
$post = json_encode($post, JSON_UNESCAPED_UNICODE | JSON_UNESCAPED_SLASHES | JSON_INVALID_UTF8_IGNORE);
$additional_data =
$this->get(
$proxy,
"https://www.startpage.com/sp/qi?qimsn=ex&sxap=%2Fv1%2Fquery&sc=" . $json["render"]["callback_sc"] . "&sr=1",
$post,
true,
true
);
$additional_data = json_decode($additional_data, true);
if($additional_data === null){
throw new Exception("Failed to decode JSON"); // just break out, dont fail completely
}
if(!isset($additional_data["knowledgePanel"])){
throw new Exception("Response has missing data (knowledgePanel)");
}
$additional_data = $additional_data["knowledgePanel"];
$answer = [
"title" => $additional_data["meta"]["title"],
"description" => [
[
"type" => "quote",
"value" => $additional_data["meta"]["description"]
]
],
"url" => $additional_data["meta"]["origWikiUrl"],
"thumb" => $additional_data["meta"]["image"],
"table" => [],
"sublink" => []
];
// parse html for instant answer
$this->fuckhtml->load($additional_data["html"]);
$div =
$this->fuckhtml
->getElementsByTagName(
"div"
);
// get description
$description =
$this->fuckhtml
->getElementsByClassName(
"sx-kp-short-extract sx-kp-short-extract-complete",
$div
);
if(count($description) !== 0){
$answer["description"][] = [
"type" => "text",
"value" =>
html_entity_decode(
$this->fuckhtml
->getTextContent(
$description[0]
)
)
];
}
// get socials
$socials =
$this->fuckhtml
->getElementsByClassName(
"sx-wiki-social-link",
"a"
);
foreach($socials as $social){
$title =
$this->fuckhtml
->getTextContent(
$social["attributes"]["title"]
);
$url =
$this->fuckhtml
->getTextContent(
$social["attributes"]["href"]
);
switch($title){
case "Official Website":
$title = "Website";
break;
}
$answer["sublink"][$title] = $url;
}
// get videos
$videos =
$this->fuckhtml
->getElementsByClassName(
"sx-kp-video-grid-item",
$div
);
foreach($videos as $video){
$this->fuckhtml->load($video);
$as =
$this->fuckhtml
->getElementsByTagName(
"a"
);
if(count($as) === 0){
// ?? invalid
continue;
}
$image =
$this->fuckhtml
->getElementsByAttributeName(
"data-sx-src",
"img"
);
if(count($image) !== 0){
$thumb = [
"ratio" => "16:9",
"url" =>
$this->fuckhtml
->getTextContent(
$image[0]["attributes"]["data-sx-src"]
)
];
}else{
$thumb = [
"ratio" => null,
"url" => null
];
}
$out["video"][] = [
"title" =>
$this->fuckhtml
->getTextContent(
$as[0]["attributes"]["title"]
),
"description" => null,
"date" => null,
"duration" => null,
"views" => null,
"thumb" => $thumb,
"url" =>
$this->fuckhtml
->getTextContent(
$as[0]["attributes"]["href"]
)
];
}
// reset
$this->fuckhtml->load($additional_data["html"]);
// get table elements
$table =
$this->fuckhtml
->getElementsByClassName(
"sx-infobox",
"table"
);
if(count($table) !== 0){
$trs =
$this->fuckhtml
->getElementsByTagName(
"tr"
);
foreach($trs as $tr){
$this->fuckhtml->load($tr);
// ok so startpage devs cant fucking code a table
// td = content
// th (AAAHH) = title
$tds =
$this->fuckhtml
->getElementsByTagName(
"td"
);
$ths =
$this->fuckhtml
->getElementsByTagName(
"th"
);
if(
count($ths) === 1 &&
count($tds) === 1
){
$title =
$this->fuckhtml
->getTextContent(
$ths[0]
);
$description = [];
$this->fuckhtml->load($tds[0]);
$lis =
$this->fuckhtml
->getElementsByTagName(
"li"
);
if(count($lis) !== 0){
foreach($lis as $li){
$description[] =
$this->fuckhtml
->getTextContent(
$li
);
}
$description = implode(", ", $description);
}else{
$description =
$this->fuckhtml
->getTextContent(
$tds[0]
);
}
$answer["table"][$title] = $description;
}
}
}
$out["answer"][] = $answer;
}catch(Exception $error){
// do nothing
//echo "error!";
}
}
return $out; return $out;
} }
@@ -1428,12 +1120,16 @@ class startpage{
[ [
"lui" => "english", "lui" => "english",
"language" => "english", "language" => "english",
"query" => $str["q"],
"cat" => $pagetype,
"sc" => $str["sc"], "sc" => $str["sc"],
"t" => "device", "t" => "device",
"cat" => $pagetype,
"segment" => "startpage.udog", "segment" => "startpage.udog",
"page" => $str["page"] "abd" => 0,
"abe" => 0,
"query" => $str["q"],
"page" => $str["page"],
"qsr" => "all",
"qadf" => "none" // @ todo fix (??)
] ]
), ),
$pagetype, $pagetype,

View File

@@ -868,123 +868,71 @@ class yandex{
if($get["npt"]){ if($get["npt"]){
[$params, $proxy] = [$get, $proxy] =
$this->backend->get( $this->backend->get(
$get["npt"], $get["npt"],
"video" "video"
); );
$params = json_decode($params, true); $get = json_decode($get, true);
$nsfw = $params["nsfw"];
unset($params["nsfw"]);
}else{ }else{
$search = $get["s"]; if(strlen($get["s"]) === 0){
if(strlen($search) === 0){
throw new Exception("Search term is empty!"); throw new Exception("Search term is empty!");
} }
$proxy = $this->backend->get_ip(); $proxy = $this->backend->get_ip();
$nsfw = $get["nsfw"];
$time = $get["time"];
$duration = $get["duration"];
// https://yandex.com/video/search
// ?tmpl_version=releases/frontend/video/v1.1168.0#8d942de0f4ebc4eb6b8f3c24ffbd1f8dbc5bbe63
// &format=json
// &request=
// {
// "blocks":[
// {"block":"extra-content","params":{},"version":2},
// {"block":"i-global__params:ajax","params":{},"version":2},
// {"block":"search2:ajax","params":{},"version":2},
// {"block":"vital-incut","params":{},"version":2},
// {"block":"content_type_search","params":{},"version":2},
// {"block":"serp-controller","params":{},"version":2},
// {"block":"cookies_ajax","params":{},"version":2}
// ],
// "metadata":{
// "bundles":{"lb":"^G]!q<X120"},
// "assets":{"las":"react-with-dom=1;185.0=1;73.0=1;145.0=1;5a502a.0=1;32c342.0=1;b84ac8.0=1"},
// "extraContent":{"names":["i-react-ajax-adapter"]}
// }
// }
// &yu=4861394161661655015
// &from=tabbar
// &reqid=1693106278500184-6825210746979814879-balancer-l7leveler-kubr-yp-sas-7-BAL-4237
// &suggest_reqid=486139416166165501562797413447032
// &text=minecraft
$params = [
"tmpl_version" => "releases/frontend/video/v1.1168.0#8d942de0f4ebc4eb6b8f3c24ffbd1f8dbc5bbe63",
"format" => "json",
"request" => json_encode([
"blocks" => [
(object)[
"block" => "extra-content",
"params" => (object)[],
"version" => 2
],
(object)[
"block" => "i-global__params:ajax",
"params" => (object)[],
"version" => 2
],
(object)[
"block" => "search2:ajax",
"params" => (object)[],
"version" => 2
],
(object)[
"block" => "vital-incut",
"params" => (object)[],
"version" => 2
],
(object)[
"block" => "content_type_search",
"params" => (object)[],
"version" => 2
],
(object)[
"block" => "serp-controller",
"params" => (object)[],
"version" => 2
],
(object)[
"block" => "cookies_ajax",
"params" => (object)[],
"version" => 2
]
],
"metadata" => (object)[
"bundles" => (object)[
"lb" => "^G]!q<X120"
],
"assets" => (object)[
"las" => "react-with-dom=1;185.0=1;73.0=1;145.0=1;5a502a.0=1;32c342.0=1;b84ac8.0=1"
],
"extraContent" => (object)[
"names" => [
"i-react-ajax-adapter"
]
]
]
]),
"text" => $search
];
if($duration != "any"){
$params["duration"] = $duration;
}
if($time != "any"){
$params["within"] = $time;
}
} }
// https://yandex.com/video/search?text=skycamefalling&from=tabbar&format=json&ncrnd=7271&p=0&parent-reqid=&request={%22blocks%22%3A[{%22block%22%3A%22video-app%22%2C%22params%22%3A{}}]}&serpid=1777751040971457-16832445014469941403-balancer-l7leveler-kubr-yp-klg-151-BAL&yu=3091577281773194415&tmpl_version=releases-frontend-video-v1.1816.0__3bdc24e10a8a138a1194877428e220a3ca0dbc5a
// https://yandex.com/video/search
// ?text=skycamefalling
// &from=tabbar
// &format=json
// &ncrnd=7271
// &p=0
// &parent-reqid=
// &request={%22blocks%22%3A[{%22block%22%3A%22video-app%22%2C%22params%22%3A{}}]} {"blocks":[{"block":"video-app","params":{}}]}
// &serpid=1777751040971457-16832445014469941403-balancer-l7leveler-kubr-yp-klg-151-BAL
// &yu=3091577281773194415
// &tmpl_version=releases-frontend-video-v1.1816.0__3bdc24e10a8a138a1194877428e220a3ca0dbc5a
$params = [
"text" => $get["s"],
"from" => "tabbar",
"format" => "json",
"ncrnd" => 7271,
"p" => 0,
"parent-reqid" => "",
"request" => json_encode((object)[
"blocks" => [
(object)[
"block" => "video-app",
"params" => (object)[]
]
]
]),
"serpid" => "1777751040971457-16832445014469941403-balancer-l7leveler-kubr-yp-klg-151-BAL",
"yu" => 3091577281773194415,
"tmpl_version" => "releases-frontend-video-v1.1816.0__3bdc24e10a8a138a1194877428e220a3ca0dbc5a"
];
if(isset($get["p"])){
$params["p"] = $get["p"];
}
if($get["duration"] != "any"){
$params["duration"] = $get["duration"];
}
if($get["time"] != "any"){
$params["within"] = $get["time"];
}
/* /*
$handle = fopen("scraper/yandex-video.json", "r"); $handle = fopen("scraper/yandex-video.json", "r");
$json = fread($handle, filesize("scraper/yandex-video.json")); $json = fread($handle, filesize("scraper/yandex-video.json"));
@@ -996,7 +944,7 @@ class yandex{
$proxy, $proxy,
"https://yandex.com/video/search", "https://yandex.com/video/search",
$params, $params,
$nsfw, $get["nsfw"],
"yandex_v" "yandex_v"
); );
}catch(Exception $error){ }catch(Exception $error){
@@ -1011,7 +959,7 @@ class yandex{
throw new Exception("Could not parse JSON"); throw new Exception("Could not parse JSON");
} }
if(!isset($json["blocks"])){ if(!isset($json["results"]["clips"]["items"])){
throw new Exception("Yandex blocked this 4get instance. Please try again in 7~ minutes."); throw new Exception("Yandex blocked this 4get instance. Please try again in 7~ minutes.");
} }
@@ -1026,209 +974,120 @@ class yandex{
"reel" => [] "reel" => []
]; ];
$html = null; foreach($json["results"]["clips"]["items"] as $k => $data){
foreach($json["blocks"] as $block){
if(isset($block["html"])){ if(isset($data["preview"]["posterSrc"])){
$html .= $block["html"]; $poster = $data["preview"]["posterSrc"];
if(
preg_match(
'/^\/\//',
$data["preview"]["posterSrc"]
)
){
$poster = "https:" . $poster;
}
$thumb = [
"ratio" => "16:9",
"url" => $poster
];
}else{
$thumb = [
"ratio" => null,
"url" => null
];
} }
$out["video"][] = [
"title" => $data["relatedParams"]["text"],
"description" => $this->titledots($data["description"]),
"author" => [
"name" =>
isset($json["results"]["clips"]["dups"][$k]["host"]["secondPart"]["name"]) ?
$json["results"]["clips"]["dups"][$k]["host"]["secondPart"]["name"] : null,
"url" =>
isset($json["results"]["clips"]["dups"][$k]["host"]["secondPart"]["origUrl"]) ?
$json["results"]["clips"]["dups"][$k]["host"]["secondPart"]["origUrl"] : null,
"avatar" => null
],
"date" =>
isset($json["results"]["clips"]["dups"][$k]["date"]) ?
strtotime($json["results"]["clips"]["dups"][$k]["date"]) : null,
"duration" =>
isset($json["results"]["clips"]["dups"][$k]["duration"]["value"]) ?
(int)$json["results"]["clips"]["dups"][$k]["duration"]["value"] : null,
"views" =>
isset($json["results"]["clips"]["dups"][$k]["views"]["text"]) ?
$this->parseviews($json["results"]["clips"]["dups"][$k]["views"]["text"]) : null,
"thumb" => $thumb,
"url" =>
preg_replace(
'/^http:\/\//',
"https://",
$data["relatedParams"]["related_url"]
)
];
} }
$this->fuckhtml->load($html); // get npt
if($json["results"]["search"]["hasNextPage"]){
$div =
$this->fuckhtml $get["p"] = (int)$json["results"]["search"]["currentPage"] + 1;
->getElementsByTagName("div");
/*
Get nextpage
*/
$npt =
$this->fuckhtml
->getElementsByClassName(
"more more_direction_next i-bem",
$div
);
if(count($npt) !== 0){
$params["p"] = "1";
$params["nsfw"] = $nsfw;
$out["npt"] = $out["npt"] =
$this->backend->store( $this->backend->store(
json_encode($params), json_encode($get),
"video", "video",
$proxy $proxy
); );
} }
$items =
$this->fuckhtml
->getElementsByClassName(
"serp-item",
$div
);
foreach($items as $item){
$data =
json_decode(
$this->fuckhtml
->getTextContent(
$item["attributes"]["data-video"]
),
true
);
$this->fuckhtml->load($item);
$thumb =
$this->fuckhtml
->getElementsByClassName(
"thumb-image__image",
"img"
);
$c = 1;
if(count($thumb) === 0){
$thumb = [
"url" => null,
"ratio" => null
];
}else{
$thumb = [
"url" =>
str_replace(
"//",
"https://",
$this->fuckhtml
->getTextContent(
$thumb
[0]
["attributes"]
["src"]
),
$c
),
"ratio" => "16:9"
];
}
$smallinfos =
$this->fuckhtml
->getElementsByClassName(
"serp-item__sitelinks-item",
"div"
);
$date = null;
$views = null;
$first = true;
foreach($smallinfos as $info){
if($first){
$first = false;
continue;
}
$info =
$this->fuckhtml
->getTextContent(
$info
);
if($temp_date = strtotime($info)){
$date = $temp_date;
}else{
$views = $this->parseviews($info);
}
}
$description =
$this->fuckhtml
->getElementsByClassName(
"serp-item__text serp-item__text_visibleText_always",
"div"
);
if(count($description) === 0){
$description = null;
}else{
$description =
$this->titledots(
$this->fuckhtml
->getTextContent(
$description[0]
)
);
}
$out["video"][] = [
"title" =>
$this->fuckhtml
->getTextContent(
$this->titledots(
$data["title"]
)
),
"description" => $description,
"author" => [
"name" => null,
"url" => null,
"avatar" => null
],
"date" => $date,
"duration" =>
(int)$data
["counters"]
["toHostingLoaded"]
["stredParams"]
["duration"],
"views" => $views,
"thumb" => $thumb,
"url" =>
str_replace(
"http://",
"https://",
$this->fuckhtml
->getTextContent(
$data["counters"]
["toHostingLoaded"]
["postfix"]
["href"]
),
$c
)
];
}
return $out; return $out;
} }
private function parseviews($text){ private function parseviews($number){
$text = explode(" ", $text); // decimal should always be 1 number long
$number = explode(" ", $number, 2);
$number = $number[0];
$num = (float)$text[0]; $unit = strtolower($number[strlen($number) - 1]);
$mod = $text[1];
switch($mod){ $tmp = explode(".", $number, 2);
$number = (int)$number;
if(count($tmp) === 2){
case "bln.": $num = $num * 1000000000; break; $decimal = (int)$tmp[1];
case "mln.": $num = $num * 1000000; break; }else{
case "thsd.": $num = $num * 1000; break;
$decimal = 0;
} }
return $num; switch($unit){
case "k":
$exponant = 1000;
break;
case "m":
$exponant = 1000000;
break;
case "b";
$exponant = 1000000000;
break;
default:
$exponant = 1;
break;
}
return ($number * $exponant) + ($decimal * ($exponant / 10));
} }
private function titledots($title){ private function titledots($title){

View File

@@ -14,234 +14,209 @@ class yep{
public function getfilters($page){ public function getfilters($page){
return [ return [
"country" => [ "lang" => [
"display" => "Country", "display" => "Language",
"option" => [ "option" => [
"all" => "All regions", "any" => "Any language",
"af" => "Afghanistan", "aa" => "Afar",
"al" => "Albania", "ab" => "Abkhazian",
"dz" => "Algeria", "ae" => "Avestan",
"as" => "American Samoa", "af" => "Afrikaans",
"ad" => "Andorra", "ak" => "Akan",
"ao" => "Angola", "am" => "Amharic",
"ai" => "Anguilla", "an" => "Aragonese",
"ag" => "Antigua and Barbuda", "ar" => "Arabic",
"ar" => "Argentina", "as" => "Assamese",
"am" => "Armenia", "av" => "Avaric",
"aw" => "Aruba", "ay" => "Aymara",
"au" => "Australia", "az" => "Azerbaijani",
"at" => "Austria", "ba" => "Bashkir",
"az" => "Azerbaijan", "be" => "Belarusian",
"bs" => "Bahamas", "bg" => "Bulgarian",
"bh" => "Bahrain", "bh" => "Bihari",
"bd" => "Bangladesh", "bi" => "Bislama",
"bb" => "Barbados", "bm" => "Bambara",
"by" => "Belarus", "bn" => "Bengali",
"be" => "Belgium", "bo" => "Tibetan",
"bz" => "Belize", "br" => "Breton",
"bj" => "Benin", "bs" => "Bosnian",
"bt" => "Bhutan", "ca" => "Catalan",
"bo" => "Bolivia", "ce" => "Chechen",
"ba" => "Bosnia and Herzegovina", "ch" => "Chamorro",
"bw" => "Botswana", "co" => "Corsican",
"br" => "Brazil", "cr" => "Cree",
"bn" => "Brunei Darussalam", "cs" => "Czech",
"bg" => "Bulgaria", "cu" => "Church Slavic",
"bf" => "Burkina Faso", "cv" => "Chuvash",
"bi" => "Burundi", "cy" => "Welsh",
"cv" => "Cabo Verde", "da" => "Danish",
"kh" => "Cambodia", "de" => "German",
"cm" => "Cameroon", "dv" => "Divehi",
"ca" => "Canada", "dz" => "Dzongkha",
"ky" => "Cayman Islands", "ee" => "Ewe",
"cf" => "Central African Republic", "el" => "Greek",
"td" => "Chad", "en" => "English",
"cl" => "Chile", "eo" => "Esperanto",
"cn" => "China", "es" => "Spanish",
"co" => "Colombia", "et" => "Estonian",
"cg" => "Congo", "eu" => "Basque",
"cd" => "Congo, Democratic Republic", "fa" => "Persian",
"ck" => "Cook Islands", "ff" => "Fulah",
"cr" => "Costa Rica", "fi" => "Finnish",
"hr" => "Croatia", "fj" => "Fijian",
"cu" => "Cuba", "fo" => "Faroese",
"cy" => "Cyprus", "fr" => "French",
"cz" => "Czechia", "fy" => "Western Frisian",
"ci" => "Côte d'Ivoire", "ga" => "Irish",
"dk" => "Denmark", "gd" => "Scottish Gaelic",
"dj" => "Djibouti", "gl" => "Galician",
"dm" => "Dominica", "gn" => "Guarani",
"do" => "Dominican Republic", "gu" => "Gujarati",
"ec" => "Ecuador", "gv" => "Manx",
"eg" => "Egypt", "ha" => "Hausa",
"sv" => "El Salvador", "he" => "Hebrew",
"gq" => "Equatorial Guinea", "hi" => "Hindi",
"ee" => "Estonia", "ho" => "Hiri Motu",
"et" => "Ethiopia", "hr" => "Croatian",
"fo" => "Faroe Islands", "ht" => "Haitian",
"fj" => "Fiji", "hu" => "Hungarian",
"fi" => "Finland", "hy" => "Armenian",
"fr" => "France", "hz" => "Herero",
"gf" => "French Guiana", "ia" => "Interlingua",
"pf" => "French Polynesia", "id" => "Indonesian",
"ga" => "Gabon", "ie" => "Interlingue",
"gm" => "Gambia", "ig" => "Igbo",
"ge" => "Georgia", "ii" => "Sichuan Yi",
"de" => "Germany", "ik" => "Inupiaq",
"gh" => "Ghana", "io" => "Ido",
"gi" => "Gibraltar", "is" => "Icelandic",
"gr" => "Greece", "it" => "Italian",
"gl" => "Greenland", "iu" => "Inuktitut",
"gd" => "Grenada", "ja" => "Japanese",
"gp" => "Guadeloupe", "jv" => "Javanese",
"gu" => "Guam", "ka" => "Georgian",
"gt" => "Guatemala", "kg" => "Kongo",
"gg" => "Guernsey", "ki" => "Kikuyu",
"gn" => "Guinea", "kj" => "Kuanyama",
"gy" => "Guyana", "kk" => "Kazakh",
"ht" => "Haiti", "kl" => "Kalaallisut",
"hn" => "Honduras", "km" => "Central Khmer",
"hk" => "Hong Kong", "kn" => "Kannada",
"hu" => "Hungary", "ko" => "Korean",
"is" => "Iceland", "kr" => "Kanuri",
"in" => "India", "ks" => "Kashmiri",
"id" => "Indonesia", "ku" => "Kurdish",
"iq" => "Iraq", "kv" => "Komi",
"ie" => "Ireland", "kw" => "Cornish",
"im" => "Isle of Man", "ky" => "Kyrgyz",
"il" => "Israel", "la" => "Latin",
"it" => "Italy", "lb" => "Luxembourgish",
"jm" => "Jamaica", "lg" => "Ganda",
"jp" => "Japan", "li" => "Limburgish",
"je" => "Jersey", "ln" => "Lingala",
"jo" => "Jordan", "lo" => "Lao",
"kz" => "Kazakhstan", "lt" => "Lithuanian",
"ke" => "Kenya", "lu" => "Luba-Katanga",
"ki" => "Kiribati", "lv" => "Latvian",
"kw" => "Kuwait", "mg" => "Malagasy",
"kg" => "Kyrgyzstan", "mh" => "Marshallese",
"la" => "Lao People's Democratic Republic", "mi" => "Maori",
"lv" => "Latvia", "mk" => "Macedonian",
"lb" => "Lebanon", "ml" => "Malayalam",
"ls" => "Lesotho", "mn" => "Mongolian",
"ly" => "Libya", "mr" => "Marathi",
"li" => "Liechtenstein", "ms" => "Malay",
"lt" => "Lithuania", "mt" => "Maltese",
"lu" => "Luxembourg", "my" => "Burmese",
"mk" => "Macedonia", "na" => "Nauru",
"mg" => "Madagascar", "nb" => "Norwegian Bokmål",
"mw" => "Malawi", "nd" => "North Ndebele",
"my" => "Malaysia", "ne" => "Nepali",
"mv" => "Maldives", "ng" => "Ndonga",
"ml" => "Mali", "nl" => "Dutch",
"mt" => "Malta", "nn" => "Norwegian Nynorsk",
"mq" => "Martinique", "no" => "Norwegian",
"mr" => "Mauritania", "nr" => "South Ndebele",
"mu" => "Mauritius", "nv" => "Navajo",
"yt" => "Mayotte", "ny" => "Chichewa",
"mx" => "Mexico", "oc" => "Occitan",
"fm" => "Micronesia, Federated States of", "oj" => "Ojibwa",
"md" => "Moldova", "om" => "Oromo",
"mc" => "Monaco", "or" => "Oriya",
"mn" => "Mongolia", "os" => "Ossetian",
"me" => "Montenegro", "pa" => "Punjabi",
"ms" => "Montserrat", "pi" => "Pali",
"ma" => "Morocco", "pl" => "Polish",
"mz" => "Mozambique", "ps" => "Pashto",
"mm" => "Myanmar", "pt" => "Portuguese",
"na" => "Namibia", "qu" => "Quechua",
"nr" => "Nauru", "rm" => "Romansh",
"np" => "Nepal", "rn" => "Rundi",
"nl" => "Netherlands", "ro" => "Romanian",
"nc" => "New Caledonia", "ru" => "Russian",
"nz" => "New Zealand", "rw" => "Kinyarwanda",
"ni" => "Nicaragua", "sa" => "Sanskrit",
"ne" => "Niger", "sc" => "Sardinian",
"ng" => "Nigeria", "sd" => "Sindhi",
"nu" => "Niue", "se" => "Northern Sami",
"no" => "Norway", "sg" => "Sango",
"om" => "Oman", "si" => "Sinhala",
"pk" => "Pakistan", "sk" => "Slovak",
"ps" => "Palestine, State of", "sl" => "Slovenian",
"pa" => "Panama", "sm" => "Samoan",
"pg" => "Papua New Guinea", "sn" => "Shona",
"py" => "Paraguay", "so" => "Somali",
"pe" => "Peru", "sq" => "Albanian",
"ph" => "Philippines", "sr" => "Serbian",
"pn" => "Pitcairn", "ss" => "Swati",
"pl" => "Poland", "st" => "Southern Sotho",
"pt" => "Portugal", "su" => "Sundanese",
"pr" => "Puerto Rico", "sv" => "Swedish",
"qa" => "Qatar", "sw" => "Swahili",
"ro" => "Romania", "ta" => "Tamil",
"ru" => "Russian Federation", "te" => "Telugu",
"rw" => "Rwanda", "tg" => "Tajik",
"re" => "Réunion", "th" => "Thai",
"sh" => "Saint Helena", "ti" => "Tigrinya",
"kn" => "Saint Kitts and Nevis", "tk" => "Turkmen",
"lc" => "Saint Lucia", "tl" => "Tagalog",
"vc" => "Saint Vincent and the Grenadines", "tn" => "Tswana",
"ws" => "Samoa",
"sm" => "San Marino",
"st" => "Sao Tome and Principe",
"sa" => "Saudi Arabia",
"sn" => "Senegal",
"rs" => "Serbia",
"sc" => "Seychelles",
"sl" => "Sierra Leone",
"sg" => "Singapore",
"sk" => "Slovakia",
"si" => "Slovenia",
"sb" => "Solomon Islands",
"so" => "Somalia",
"kr" => "Sourth Korea",
"za" => "South Africa",
"es" => "Spain",
"lk" => "Sri Lanka",
"sr" => "Suriname",
"se" => "Sweden",
"ch" => "Switzerland",
"tw" => "Taiwan",
"tj" => "Tajikistan",
"tz" => "Tanzania",
"th" => "Thailand",
"tl" => "Timor-Leste",
"tg" => "Togo",
"tk" => "Tokelau",
"to" => "Tonga", "to" => "Tonga",
"tt" => "Trinidad and Tobago", "tr" => "Turkish",
"tn" => "Tunisia", "ts" => "Tsonga",
"tr" => "Turkey", "tt" => "Tatar",
"tm" => "Turkmenistan", "tw" => "Twi",
"ug" => "Uganda", "ty" => "Tahitian",
"ua" => "Ukraine", "ug" => "Uyghur",
"ae" => "United Arab Emirates", "uk" => "Ukrainian",
"gb" => "United Kingdom", "ur" => "Urdu",
"us" => "United States", "uz" => "Uzbek",
"uy" => "Uruguay", "ve" => "Venda",
"uz" => "Uzbekistan", "vi" => "Vietnamese",
"vu" => "Vanuatu", "vo" => "Volapük",
"ve" => "Venezuela", "wa" => "Walloon",
"vn" => "Vietnam", "wo" => "Wolof",
"vg" => "Virgin Islands, British", "xh" => "Xhosa",
"vi" => "Virgin Islands, U.S.", "yi" => "Yiddish",
"ye" => "Yemen", "yo" => "Yoruba",
"zm" => "Zambia", "za" => "Zhuang",
"zw" => "Zimbabwe" "zh" => "Chinese",
"zh-cn" => "Chinese (Simplified)",
"zh-tw" => "Chinese (Traditional)",
"zu" => "Zulu"
] ]
], ],
"nsfw" => [ "nsfw" => [
"display" => "NSFW", "display" => "NSFW",
"option" => [ "option" => [
"yes" => "Yes", "yes" => "Yes",
"maybe" => "Maybe",
"no" => "No" "no" => "No"
] ]
] ]
]; ];
} }
private function get($proxy, $url, $get = []){ private function get($proxy, $url, $get = [], $use_api = false, $post_data = null, $bearer = null){
$curlproc = curl_init(); $curlproc = curl_init();
@@ -256,21 +231,37 @@ class yep{
curl_setopt($curlproc, CURLOPT_HTTP_VERSION, CURL_HTTP_VERSION_2_0); curl_setopt($curlproc, CURLOPT_HTTP_VERSION, CURL_HTTP_VERSION_2_0);
curl_setopt($curlproc, CURLOPT_ENCODING, ""); // default encoding curl_setopt($curlproc, CURLOPT_ENCODING, ""); // default encoding
curl_setopt($curlproc, CURLOPT_HTTPHEADER,
["User-Agent: " . config::USER_AGENT, if($use_api){
"Accept: */*",
"Accept-Language: en-US,en;q=0.5", $post_data = json_encode($post_data);
"Accept-Encoding: gzip, deflate, br, zstd",
"Referer: https://yep.com/", curl_setopt($curlproc, CURLOPT_HTTPHEADER,
"Origin: https://yep.com", ["Content-Type: application/json",
"DNT: 1", "Authorization: Bearer $bearer",
"Connection: keep-alive", "Content-Length: " . strlen($post_data)]
"Sec-Fetch-Dest: empty", );
"Sec-Fetch-Mode: cors",
"Sec-Fetch-Site: same-site", curl_setopt($curlproc, CURLOPT_POST, true);
"Priority: u=4", curl_setopt($curlproc, CURLOPT_POSTFIELDS, $post_data);
"TE: trailers"] }else{
);
curl_setopt($curlproc, CURLOPT_HTTPHEADER,
["User-Agent: " . config::USER_AGENT,
"Accept: */*",
"Accept-Language: en-US,en;q=0.5",
"Accept-Encoding: gzip, deflate, br, zstd",
"Referer: https://yep.com/",
"Origin: https://yep.com",
"DNT: 1",
"Connection: keep-alive",
"Sec-Fetch-Dest: empty",
"Sec-Fetch-Mode: cors",
"Sec-Fetch-Site: same-site",
"Priority: u=4",
"TE: trailers"]
);
}
curl_setopt($curlproc, CURLOPT_RETURNTRANSFER, true); curl_setopt($curlproc, CURLOPT_RETURNTRANSFER, true);
curl_setopt($curlproc, CURLOPT_SSL_VERIFYHOST, 2); curl_setopt($curlproc, CURLOPT_SSL_VERIFYHOST, 2);
@@ -295,22 +286,17 @@ class yep{
public function web($get){ public function web($get){
if(config::YEP_USE_API){
return $this->web_api($get);
}
$search = $get["s"]; $search = $get["s"];
if(strlen($search) === 0){ if(strlen($search) === 0){
throw new Exception("Search term is empty!"); throw new Exception("Search term is empty!");
} }
$country = $get["country"];
$nsfw = $get["nsfw"];
switch($nsfw){
case "yes": $nsfw = "off"; break;
case "maybe": $nsfw = "moderate"; break;
case "no": $nsfw = "strict"; break;
}
$out = [ $out = [
"status" => "ok", "status" => "ok",
"spelling" => [ "spelling" => [
@@ -327,22 +313,23 @@ class yep{
"related" => [] "related" => []
]; ];
// parse filters
$filters = [
"limit" => 100, // wwwwwwwwwwwwwww
"query" => $search,
];
if($get["nsfw"] == "no"){ $filters["safeSearch"] = "moderate"; }
if($get["lang"] != "any"){ $filters["hl"] = $get["lang"]; }
try{ try{
// https://api.yep.com/fs/2/search?client=web&gl=CA&no_correct=false&q=undefined+variable+javascript&safeSearch=off&type=web // https://api.yep.com/search?limit=20&query=asmr
$json = $json =
$this->get( $this->get(
$this->backend->get_ip(), $this->backend->get_ip(),
"https://api.yep.com/fs/2/search", "https://api.yep.com/search",
[ $filters
"client" => "web",
"gl" => $country == "all" ? $country : strtoupper($country),
"limit" => "99999",
"no_correct" => "false",
"q" => $search,
"safeSearch" => $nsfw,
"type" => "web"
]
); );
}catch(Exception $error){ }catch(Exception $error){
@@ -408,7 +395,7 @@ class yep{
) )
), ),
"url" => $item["url"], "url" => $item["url"],
"date" => strtotime($item["first_seen"]), "date" => null,
"type" => "web", "type" => "web",
"thumb" => [ "thumb" => [
"url" => null, "url" => null,
@@ -422,83 +409,11 @@ class yep{
} }
} }
if(isset($json[1]["featured_news"])){
foreach($json[1]["featured_news"] as $news){
$out["news"][] = [
"title" => $news["title"],
"description" =>
$this->titledots(
strip_tags(
html_entity_decode(
$news["snippet"]
)
)
),
"date" => strtotime($news["first_seen"]),
"thumb" =>
isset($news["img"]) ?
[
"url" => $this->unshiturl($news["img"]),
"ratio" => "16:9"
] :
[
"url" => null,
"ratio" => null
],
"url" => $news["url"]
];
}
}
if(isset($json[1]["featured_images"])){
foreach($json[1]["featured_images"] as $image){
if(
$image["width"] !== 0 &&
$image["height"] !== 0
){
$thumb_width = $image["width"] >= 260 ? 260 : $image["width"];
$thumb_height = ceil($image["height"] * ($thumb_width / $image["width"]));
$width = $image["width"];
$height = $image["height"];
}else{
$thumb_width = null;
$thumb_height = null;
$width = null;
$height = null;
}
$out["image"][] = [
"title" => $image["title"],
"source" => [
[
"url" => $image["image_id"],
"width" => $width,
"height" => $height
],
[
"url" => $image["src"],
"width" => $thumb_width,
"height" => $thumb_height
]
],
"url" => $image["host_page"]
];
}
}
return $out; return $out;
} }
private function web_api($get){
public function image($get){
$search = $get["s"]; $search = $get["s"];
if(strlen($search) === 0){ if(strlen($search) === 0){
@@ -506,142 +421,53 @@ class yep{
throw new Exception("Search term is empty!"); throw new Exception("Search term is empty!");
} }
$country = $get["country"];
$nsfw = $get["nsfw"];
switch($nsfw){
case "yes": $nsfw = "off"; break;
case "maybe": $nsfw = "moderate"; break;
case "no": $nsfw = "strict"; break;
}
$out = [ $out = [
"status" => "ok", "status" => "ok",
"spelling" => [
"type" => "no_correction",
"using" => null,
"correction" => null
],
"npt" => null, "npt" => null,
"image" => [] "answer" => [],
"web" => [],
"image" => [],
"video" => [],
"news" => [],
"related" => []
]; ];
// parse filters
$filters = [
"query" => $search,
"limit" => 100
];
if($get["nsfw"] == "no"){ $filters["safe_search"] = true; }
if($get["lang"] != "any"){ $filters["language"] = [ $get["lang"] ]; }
// add api key
$key_data = $this->backend->get_key();
try{ try{
$json = $json =
$this->get( $this->get(
$this->backend->get_ip(), // no nextpage! $this->backend->get_ip($key_data["increment"]),
"https://api.yep.com/fs/2/search", "https://platform.yep.com/api/search",
[ [],
"client" => "web", true,
"gl" => $country == "all" ? $country : strtoupper($country), $filters,
"no_correct" => "false", $key_data["key"]
"q" => $search,
"safeSearch" => $nsfw,
"type" => "images"
]
); );
}catch(Exception $error){ }catch(Exception $error){
throw new Exception("Failed to fetch JSON"); throw new Exception("Failed to fetch JSON");
} }
$this->detect_cf($json); // should never happen
//$this->detect_cf($json);
$json = json_decode($json, true);
if($json === null){
throw new Exception("Failed to decode JSON");
}
if(isset($json[1]["results"])){
foreach($json[1]["results"] as $item){
if(
$item["width"] !== 0 &&
$item["height"] !== 0
){
$thumb_width = $item["width"] >= 260 ? 260 : $item["width"];
$thumb_height = ceil($item["height"] * ($thumb_width / $item["width"]));
$width = $item["width"];
$height = $item["height"];
}else{
$thumb_width = null;
$thumb_height = null;
$width = null;
$height = null;
}
$out["image"][] = [
"title" => $item["title"],
"source" => [
[
"url" => $item["image_id"],
"width" => $width,
"height" => $height
],
[
"url" => $item["src"],
"width" => $thumb_width,
"height" => $thumb_height
]
],
"url" => $item["host_page"]
];
}
}
return $out;
}
public function news($get){
$search = $get["s"];
if(strlen($search) === 0){
throw new Exception("Search term is empty!");
}
$country = $get["country"];
$nsfw = $get["nsfw"];
switch($nsfw){
case "yes": $nsfw = "off"; break;
case "maybe": $nsfw = "moderate"; break;
case "no": $nsfw = "strict"; break;
}
$out = [
"status" => "ok",
"npt" => null,
"news" => []
];
try{
// https://api.yep.com/fs/2/search?client=web&gl=CA&no_correct=false&q=undefined+variable+javascript&safeSearch=off&type=web
$json =
$this->get(
$this->backend->get_ip(),
"https://api.yep.com/fs/2/search",
[
"client" => "web",
"gl" => $country == "all" ? $country : strtoupper($country),
"limit" => "99999",
"no_correct" => "false",
"q" => $search,
"safeSearch" => $nsfw,
"type" => "news"
]
);
}catch(Exception $error){
throw new Exception("Failed to fetch JSON");
}
$this->detect_cf($json);
$json = json_decode($json, true); $json = json_decode($json, true);
//$json = json_decode(file_get_contents("scraper/yep.json"), true); //$json = json_decode(file_get_contents("scraper/yep.json"), true);
@@ -651,34 +477,53 @@ class yep{
throw new Exception("Failed to decode JSON"); throw new Exception("Failed to decode JSON");
} }
if(isset($json[1]["results"])){ if(isset($json["error"])){
foreach($json[1]["results"] as $item){
throw new Exception("Yep API returned an error: " . $json["error"]);
}
if(isset($json["errors"])){
throw new Exception("Yep API returned the following errors: {$json["message"]}");
}
if(
isset($json["success"]) &&
$json["success"] !== true
){
throw new Exception("Yep API returned a false-y success value");
}
if(!isset($json["results"])){
throw new Exception("Yep API did not return a results object");
}
foreach($json["results"] as $item){
if(
$item["url"] === null ||
$item["url"] == ""
){
$out["news"][] = [ // sometimes API fucks up
"title" => $item["title"], continue;
"author" => null,
"description" =>
$this->titledots(
strip_tags(
html_entity_decode(
$item["snippet"]
)
)
),
"date" => strtotime($item["first_seen"]),
"thumb" =>
isset($item["img"]) ?
[
"url" => $this->unshiturl($item["img"]),
"ratio" => "16:9"
] :
[
"url" => null,
"ratio" => null
],
"url" => $item["url"]
];
} }
$out["web"][] = [
"title" => $item["title"],
"description" => $item["description"],
"url" => $item["url"],
"date" => null,
"type" => "web",
"thumb" => [
"url" => null,
"ratio" => null
],
"sublink" => [],
"table" => []
];
} }
return $out; return $out;

View File

@@ -161,14 +161,6 @@ $settings = [
"value" => "yep", "value" => "yep",
"text" => "Yep" "text" => "Yep"
], ],
[
"value" => "greppr",
"text" => "Greppr"
],
[
"value" => "crowdview",
"text" => "Crowdview"
],
[ [
"value" => "mwmbl", "value" => "mwmbl",
"text" => "Mwmbl" "text" => "Mwmbl"
@@ -196,10 +188,6 @@ $settings = [
[ [
"value" => "wiby", "value" => "wiby",
"text" => "wiby" "text" => "wiby"
],
[
"value" => "curlie",
"text" => "Curlie"
] ]
] ]
], ],
@@ -223,6 +211,10 @@ $settings = [
"value" => "google", "value" => "google",
"text" => "Google" "text" => "Google"
], ],
[
"value" => "google_api",
"text" => "Google API"
],
[ [
"value" => "google_cse", "value" => "google_cse",
"text" => "Google CSE" "text" => "Google CSE"
@@ -239,10 +231,6 @@ $settings = [
"value" => "qwant", "value" => "qwant",
"text" => "Qwant" "text" => "Qwant"
], ],
[
"value" => "yep",
"text" => "Yep"
],
[ [
"value" => "baidu", "value" => "baidu",
"text" => "Baidu" "text" => "Baidu"
@@ -379,10 +367,6 @@ $settings = [
"value" => "qwant", "value" => "qwant",
"text" => "Qwant" "text" => "Qwant"
], ],
[
"value" => "yep",
"text" => "Yep"
],
[ [
"value" => "mojeek", "value" => "mojeek",
"text" => "Mojeek" "text" => "Mojeek"