backend = new backend("naver");

        include "lib/fuckhtml.php";
        $this->fuckhtml = new fuckhtml();
    }

    /**
     * Describe the filters offered for a given results page.
     *
     * @param string $page "web", "images" or "videos".
     * @return array|null Filter definitions keyed by filter name. Any other
     *                    page falls out of the switch and yields null.
     */
    public function getfilters($page){

        // time filter shared by the web and image pages
        $base = [
            "time" => [
                "display" => "Time",
                "option" => [
                    "any" => "Any time",
                    "1h" => "Last hour",
                    "1d" => "Last day",
                    "1w" => "Last week",
                    "1m" => "Last month",
                    "3m" => "Last 3 months",
                    "6m" => "Last 6 months",
                    "1y" => "Last year",
                ]
            ]
        ];

        switch($page){

            case "web":
                return array_merge(
                    [
                        "sort" => [
                            "display" => "Sort by",
                            "option" => [
                                "relevance" => "Relevance", // so:r
                                "most_recent" => "Most recent" // so:dd
                            ]
                        ]
                    ],
                    $base
                );

            case "images":
                return array_merge(
                    $base,
                    [
                        "size" => [
                            "display" => "Size",
                            "option" => [
                                "any" => "Any size",
                                "highdef" => "High definition" // &res_fr=786432&res_to=100000000
                            ]
                        ],
                        "color" => [ // &color=
                            "display" => "Color",
                            "option" => [
                                "any" => "Any color",
                                "orange" => "Orange",
                                "yellow" => "Yellow",
                                "lime" => "Lime",
                                "green" => "Green",
                                "cyan" => "Cyan",
                                "blue" => "Blue",
                                "purple" => "Purple",
                                "pink" => "Pink",
                                "apricot" => "Apricot",
                                "ocher" => "Ocher",
                                "sepia" => "Sepia",
                                "black" => "Black",
                                "gray" => "Gray",
                                "white" => "White"
                            ]
                        ],
                        "license" => [ // &ccl=
                            "display" => "License",
                            "option" => [
                                "any" => "Any license",
                                "1" => "CCL Total",
                                "2" => "Commercial use",
                                "4" => "Modifications permitted"
                            ]
                        ]
                    ]
                );

            case "videos":
                return [
                    "time" => [ // &period=
                        "display" => "Time",
                        "option" => [
                            "any" => "Any time",
                            "1day" => "Last day",
                            "1week" => "Last week",
                            "1month" => "Last month",
                            "3month" => "Last 3 months",
                            "6month" => "Last 6 months",
                            "1year" => "Last year"
                        ]
                    ],
                    "sort" => [
                        "display" => "Sort by",
                        "option" => [
                            "rel" => "Relevance",
                            "date" => "Most recent", // &sort=date
                            "playcount" => "Most views", // &sort=playcount
                        ]
                    ],
                    "type" => [
                        "display" => "Type",
                        "option" => [
                            "any" => "Any videos",
                            "shorts" => "Shorts" // &dtype=shorts
                        ]
                    ],
                    "duration" => [
                        "display" => "Duration", // &playtime=
                        "option" => [
                            "any" => "Any duration",
                            "0:600" => "10 minutes",
                            "601:1800" => "10-30 minutes",
                            "1801:3600" => "30-60 minutes",
                            "3601:65535" => "More than 1 hour"
                        ]
                    ]
                ];
        }
    }

    /**
     * Perform a GET request through the given proxy and return the raw body.
     *
     * @param mixed  $proxy  Proxy descriptor handed to backend::assign_proxy().
     * @param string $url    Target URL.
     * @param array  $get    Query parameters; appended as "?..." when not empty.
     * @param bool   $is_xhr false sends document-navigation headers, anything
     *                       else sends the script/XHR header set.
     * @return string|bool Response body as returned by curl_exec().
     * @throws Exception With the curl error message when the transfer fails.
     */
    private function get($proxy, $url, $get = [], $is_xhr = false){

        $curlproc = curl_init();

        if($get !== []){

            $get = http_build_query($get);
            $url .= "?" . $get;
        }

        curl_setopt($curlproc, CURLOPT_URL, $url);

        // use http2
        curl_setopt($curlproc, CURLOPT_HTTP_VERSION, CURL_HTTP_VERSION_2_0);

        curl_setopt($curlproc, CURLOPT_ENCODING, ""); // default encoding

        if($is_xhr === false){

            curl_setopt(
                $curlproc,
                CURLOPT_HTTPHEADER,
                [
                    "User-Agent: " . config::USER_AGENT,
                    "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
                    "Accept-Language: en-US,en;q=0.5",
                    "Accept-Encoding: gzip",
                    "DNT: 1",
                    "Sec-GPC: 1",
                    "Connection: keep-alive",
                    "Upgrade-Insecure-Requests: 1",
                    "Sec-Fetch-Dest: document",
                    "Sec-Fetch-Mode: navigate",
                    "Sec-Fetch-Site: same-origin",
                    "Priority: u=0, i",
                    "Sec-Fetch-User: ?1"
                ]
            );
        }else{

            // NOTE(review): this header advertises br/zstd; decoding those
            // depends on how libcurl was built -- confirm on the deployment.
            curl_setopt(
                $curlproc,
                CURLOPT_HTTPHEADER,
                [
                    "User-Agent: " . config::USER_AGENT,
                    "Accept: */*",
                    "Accept-Language: en-US,en;q=0.9",
                    "Accept-Encoding: gzip, deflate, br, zstd",
                    "Referer: https://search.naver.com/",
                    "DNT: 1",
                    "Sec-GPC: 1",
                    "Alt-Used: s.search.naver.com",
                    "Connection: keep-alive",
                    "Sec-Fetch-Dest: script",
                    "Sec-Fetch-Mode: no-cors",
                    "Sec-Fetch-Site: same-site",
                    "TE: trailers"
                ]
            );
        }

        curl_setopt($curlproc, CURLOPT_RETURNTRANSFER, true);
        curl_setopt($curlproc, CURLOPT_SSL_VERIFYHOST, 2);
        curl_setopt($curlproc, CURLOPT_SSL_VERIFYPEER, true);
        curl_setopt($curlproc, CURLOPT_CONNECTTIMEOUT, 30);
        curl_setopt($curlproc, CURLOPT_TIMEOUT, 30);

        $this->backend->assign_proxy($curlproc, $proxy);

        $data = curl_exec($curlproc);

        if(curl_errno($curlproc)){

            // read the message first, then release the handle before throwing
            $message = curl_error($curlproc);
            curl_close($curlproc);

            throw new Exception($message);
        }

        curl_close($curlproc);
        return $data;
    }

    /**
     * Run a web search, or continue one from a next-page token.
     *
     * @param array $get Request parameters; reads "s" (query), "npt", "sort"
     *                   and "time".
     * @return array Result set; "web" holds the parsed results and "npt" the
     *               token for the following page, when there is one.
     * @throws Exception On an empty query, a failed fetch, or when the
     *                   embedded result JSON lacks the expected nesting.
     */
    public function web($get){

        $search = $get["s"];
        if(strlen($search) === 0){

            throw new Exception("Search term is empty!");
        }

        $out = [
            "status" => "ok",
            "spelling" => [
                "type" => "no_correction",
                "using" => null,
                "correction" => null
            ],
            "npt" => null,
            "answer" => [],
            "web" => [],
            "image" => [],
            "video" => [],
            "news" => [],
            "related" => []
        ];

        if($get["npt"]){

            // the stored token is the query string of the next page
            [$d, $proxy] = $this->backend->get($get["npt"], "web");

            try{
                $html =
                    $this->get(
                        $proxy,
                        "https://search.naver.com/search.naver" . $d,
                        []
                    );
            }catch(Exception $error){

                throw new Exception("Failed to fetch search page", 0, $error);
            }
        }else{

            // parse filters
            // https://search.naver.com/search.naver
            //   ?nso=&page=1&query=nisekoi&sm=tab_pge&start=1&where=web
            $filters = [
                "nso" => "",
                "query" => $search,
                "sm" => "tab_pge",
                "where" => "web",
                "start" => 1 // increment by number of results each time (16??)
            ];

            $options = [];

            if($get["sort"] != "relevance"){

                $options[] = "so:dd";
            }

            if($get["time"] != "any"){

                $options[] = "p:" . $get["time"];
            }

            if(count($options) !== 0){

                $filters["nso"] = implode(",", $options);
            }

            $proxy = $this->backend->get_ip();

            try{
                $html =
                    $this->get(
                        $proxy,
                        "https://search.naver.com/search.naver",
                        $filters
                    );
            }catch(Exception $error){

                throw new Exception("Failed to fetch search page", 0, $error);
            }
        }

        $this->fuckhtml->load($html);

        // the result payload is the JSON argument of an inline
        // entry.bootstrap(...) call
        $results =
            preg_split(
                '/entry\.bootstrap\(document\.getElementById\("[a-f0-9-r]+"\), ?/',
                $html
            );

        if(count($results) !== 2){

            // reached when no results are found
            $nsfw_probe =
                $this->fuckhtml
                ->getElementsByClassName(
                    "dsc_adult",
                    "div"
                );

            if(count($nsfw_probe) !== 0){

                $out["answer"][] = [
                    "title" => "NSFW results",
                    "description" => [
                        [
                            "type" => "text",
                            "value" => "Naver blocks logged-out NSFW searches."
                        ]
                    ],
                    "url" => null,
                    "thumb" => null,
                    "table" => [],
                    "sublink" => []
                ];
            }

            return $out;
        }

        $json =
            json_decode(
                $this->fuckhtml
                ->extract_json(
                    $results[1]
                ),
                true
            );

        if(!isset($json["body"]["props"]["children"][0]["props"]["children"])){

            throw new Exception("Failed to access nested children");
        }

        foreach($json["body"]["props"]["children"][0]["props"]["children"] as $result){

            if(
                !isset($result["templateId"]) ||
                $result["templateId"] != "webItem" ||
                !isset($result["props"]["href"])
            ){

                // not a web result, or a result without a target URL
                continue;
            }

            $result = $result["props"];

            // get sublinks: subLinks first, then linkBtns
            $sublinks = [];

            foreach(["subLinks", "linkBtns"] as $link_key){

                if(
                    !isset($result[$link_key]) ||
                    !is_array($result[$link_key])
                ){

                    continue;
                }

                foreach($result[$link_key] as $s){

                    if(!isset($s["text"], $s["href"])){

                        continue;
                    }

                    $sublinks[] = [
                        "title" => $s["text"],
                        "description" => null,
                        "url" => $s["href"],
                        "date" => null
                    ];
                }
            }

            // get image (thumbnail)
            if(isset($result["images"][0]["imageSrc"])){

                $thumb = [
                    "ratio" => "16:9",
                    "url" => $this->unshit_thumb($result["images"][0]["imageSrc"])
                ];
            }else{

                $thumb = [
                    "ratio" => null,
                    "url" => null
                ];
            }

            // get table elements
            $table = [];

            if(isset($result["keyValue"]["contents"])){

                foreach($result["keyValue"]["contents"] as $s){

                    if(!isset($s["key"], $s["valueData"]["text"])){

                        continue;
                    }

                    $table[$s["key"]] = $s["valueData"]["text"];
                }
            }

            // get date
            $time = null;

            if(isset($result["bodyPrefixes"][0]["text"])){

                $time = $this->parse_prefix_date($result["bodyPrefixes"][0]["text"]);
            }

            $out["web"][] = [
                "title" => $this->decode_html($result["title"] ?? ""),
                "description" => $this->decode_html($result["bodyText"] ?? ""),
                "url" => $result["href"],
                "date" => $time,
                "type" => "web",
                "thumb" => $thumb,
                "sublink" => $sublinks,
                "table" => $table
            ];
        }

        // get next page
        $npt =
            $this->fuckhtml
            ->getElementsByClassName(
                "btn_next",
                "a"
            );

        if(
            count($npt) !== 0 &&
            isset($npt[0]["attributes"]["href"])
        ){

            $out["npt"] =
                $this->backend->store(
                    $this->fuckhtml
                    ->getTextContent(
                        $npt[0]["attributes"]["href"]
                    ),
                    "web",
                    $proxy
                );
        }

        return $out;
    }

    /**
     * Run an image search, or continue one from a next-page token.
     *
     * @param array $get Request parameters; reads "s", "npt", "time", "size",
     *                   "color" and "license".
     * @return array Result set; "image" holds the parsed images.
     * @throws Exception On an empty query, a failed fetch or undecodable JSON.
     */
    public function image($get){

        $search = $get["s"];
        if(strlen($search) === 0){

            throw new Exception("Search term is empty!");
        }

        $out = [
            "status" => "ok",
            "npt" => null,
            "image" => []
        ];

        if($get["npt"]){

            // the stored token is the full URL of the next page
            [$url, $proxy] = $this->backend->get($get["npt"], "images");

            try{
                $json =
                    $this->get(
                        $proxy,
                        $url,
                        [],
                        true
                    );
            }catch(Exception $error){

                throw new Exception("Failed to fetch search page", 0, $error);
            }
        }else{

            // "nso" is declared here to keep its position in the query
            // string; its value is always replaced below.
            $filters = [
                "ac" => "0",
                "api_type" => "pc_tab_more",
                "aq" => "0",
                "display" => 100,
                "logStart" => 1,
                "mode" => "column",
                "nso" => "so:r,p:all",
                "nx_search_query" => $search,
                "query" => $search,
                "section" => "image",
                "sm" => "tab_opt",
                "ssc" => "tab.image.all",
                "start" => 1,
                "where" => "image"
                // no callback parameter: the endpoint answers with raw JSON
            ];

            $options = [
                "so:r"
            ];

            if($get["time"] != "any"){

                $options[] = "p:" . $get["time"];
            }

            $filters["nso"] = implode(",", $options);

            if($get["size"] != "any"){

                $filters["res_fr"] = 786432;
                $filters["res_to"] = 100000000;
            }

            if($get["color"] != "any"){

                $filters["color"] = $get["color"];
            }

            if($get["license"] != "any"){

                $filters["ccl"] = $get["license"];
            }

            $proxy = $this->backend->get_ip();

            try{
                $json =
                    $this->get(
                        $proxy,
                        "https://s.search.naver.com/p/c/image/46/search.naver",
                        $filters,
                        true
                    );
            }catch(Exception $error){

                throw new Exception("Failed to fetch search page", 0, $error);
            }
        }

        $json = json_decode($json, true);

        if($json === null){

            throw new Exception("Failed to decode JSON");
        }

        if(!isset($json["items"])){

            // no results returned
            return $out;
        }

        foreach($json["items"] as $image){

            if(!isset($image["originalUrl"], $image["thumb"], $image["link"])){

                // entry lacks a URL we need; skip it instead of emitting nulls
                continue;
            }

            $out["image"][] = [
                "title" => trim($image["title"] ?? "", "."),
                "source" => [
                    [
                        "url" => $image["originalUrl"],
                        "width" => (int)($image["orgWidth"] ?? 0),
                        "height" => (int)($image["orgHeight"] ?? 0)
                    ],
                    [
                        "url" => $image["thumb"],
                        "width" => (int)($image["thumbWidth"] ?? 0),
                        "height" => (int)($image["thumbHeight"] ?? 0)
                    ]
                ],
                "url" => $image["link"]
            ];
        }

        // get npt
        if(
            isset($json["url"]) &&
            $json["url"] != ""
        ){

            $out["npt"] =
                $this->backend->store(
                    $json["url"],
                    "images",
                    $proxy
                );
        }

        return $out;
    }

    /**
     * Run a video search, or continue one from a next-page token.
     *
     * @param array $get Request parameters; reads "s", "npt", "type", "time",
     *                   "sort" and "duration".
     * @return array Result set; regular videos go to "video", short-form
     *               carousel entries to "reel".
     * @throws Exception On an empty query, a failed fetch or undecodable JSON.
     */
    public function video($get){

        $search = $get["s"];
        if(strlen($search) === 0){

            throw new Exception("Search term is empty!");
        }

        $out = [
            "status" => "ok",
            "npt" => null,
            "video" => [],
            "author" => [],
            "livestream" => [],
            "playlist" => [],
            "reel" => []
        ];

        // NOTE(review): video tokens are stored and fetched under the
        // "images" page key (here and at the end of this method). Left as-is
        // so already-issued tokens keep working -- confirm whether "videos"
        // was intended.
        if($get["npt"]){

            [$url, $proxy] = $this->backend->get($get["npt"], "images");

            try{
                $json =
                    $this->get(
                        $proxy,
                        $url,
                        [],
                        true
                    );
            }catch(Exception $error){

                throw new Exception("Failed to fetch search page", 0, $error);
            }
        }else{

            // example request:
            // https://s.search.naver.com/p/video/48/search.naver?ac=0&aq=0&crbase=63&display=48&dtype=&last_block_type=recom&nlu_query=&nq=&nqx_theme={"theme":{"main":{"name":"encyclopedia","source":"TOS"}}}&nx_and_query=&nx_search_hlquery=&nx_search_query=&nx_sub_query=&page=2&period=&playtime=&ptype=&query=asmr&selected_channel=&selected_cp=&sm=mtb_pge&sort=rel&ssc=tab.video.all&start=49&video_more=1
            $filters = [
                "ac" => "0",
                "aq" => "0",
                "crbase" => "78",
                "display" => 48,
                "dtype" => "",
                "last_block_type" => "recom",
                "nlu_query" => "",
                "nq" => "",
                "nx_and_query" => "",
                "nx_search_hlquery" => "",
                "nx_search_query" => "",
                "nx_sub_query" => "",
                "page" => 1,
                "period" => "",
                "playtime" => "",
                "ptype" => "",
                "query" => $search,
                "selected_channel" => "",
                "selected_cp" => "",
                "sm" => "mtb_pge",
                "sort" => "rel",
                "ssc" => "tab.video.all",
                "start" => 1,
                "video_more" => 1
            ];

            if($get["type"] != "any"){

                $filters["dtype"] = $get["type"];
            }

            if($get["time"] != "any"){

                $filters["period"] = $get["time"];
            }

            if($get["sort"] != "rel"){

                $filters["sort"] = $get["sort"];
            }

            if($get["duration"] != "any"){

                $filters["playtime"] = $get["duration"];
            }

            $proxy = $this->backend->get_ip();

            try{
                $json =
                    $this->get(
                        $proxy,
                        "https://s.search.naver.com/p/video/48/search.naver",
                        $filters,
                        true
                    );
            }catch(Exception $error){

                throw new Exception("Failed to fetch search page", 0, $error);
            }
        }

        $json = json_decode($json, true);

        if($json === null){

            throw new Exception("Failed to decode JSON");
        }

        if(!isset($json["collection"])){

            // no collection in the response: nothing to parse
            return $out;
        }

        foreach($json["collection"] as $snippet){

            if(!isset($snippet["html"])){

                continue;
            }

            $this->fuckhtml->load($snippet["html"]);

            $div =
                $this->fuckhtml
                ->getElementsByTagName(
                    "div"
                );

            $items =
                $this->fuckhtml
                ->getElementsByAttributeValue(
                    "data-template-id",
                    "videoItem",
                    $div
                );

            // parse normal videos
            foreach($items as $item){

                // NOTE(review): items at level 6 are skipped; the reason is
                // not visible from this code.
                if($item["level"] === 6){

                    continue;
                }

                $this->fuckhtml->load($item);

                // get url
                $as =
                    $this->fuckhtml
                    ->getElementsByAttributeName(
                        "data-heatmap-target",
                        "a"
                    );

                if(count($as) === 0){

                    // should not happen
                    continue;
                }

                // get thumbnail
                $thumb =
                    $this->fuckhtml
                    ->getElementsByAttributeValue(
                        "loading",
                        "lazy",
                        "img"
                    );

                if(count($thumb) !== 0){

                    $thumb = [
                        "url" =>
                            $this->unshit_thumb(
                                $this->fuckhtml
                                ->getTextContent(
                                    $thumb[0]["attributes"]["src"]
                                )
                            ),
                        "ratio" => "16:9"
                    ];
                }else{

                    $thumb = [
                        "url" => null,
                        "ratio" => null
                    ];
                }

                // get timestamp
                $timestamp_probe =
                    $this->fuckhtml
                    ->getElementsByClassName(
                        "sds-comps-text-type-footnote",
                        "span"
                    );

                if(count($timestamp_probe) !== 0){

                    $timestamp =
                        $this->hms2int(
                            $this->fuckhtml
                            ->getTextContent(
                                $timestamp_probe[0]
                            )
                        );
                }else{

                    $timestamp = null;
                }

                $out["video"][] = [
                    "title" =>
                        $this->fuckhtml
                        ->getTextContent(
                            $as[0]
                        ),
                    "description" => null,
                    "author" => [
                        "name" =>
                            isset($item["attributes"]["profileimagealt"]) ?
                            $this->fuckhtml
                            ->getTextContent(
                                $item["attributes"]["profileimagealt"]
                            ) : null,
                        "url" =>
                            isset($item["attributes"]["profileimagehref"]) ?
                            $this->fuckhtml
                            ->getTextContent(
                                $item["attributes"]["profileimagehref"]
                            ) : null,
                        "avatar" =>
                            isset($item["attributes"]["profileimagesrc"]) ?
                            $this->fuckhtml
                            ->getTextContent(
                                $item["attributes"]["profileimagesrc"]
                            ) : null
                    ],
                    "date" => null,
                    "duration" => $timestamp,
                    "views" => null,
                    "thumb" => $thumb,
                    "url" =>
                        $this->fuckhtml
                        ->getTextContent(
                            $as[0]["attributes"]["href"]
                        )
                ];
            }

            // reset the parser to the whole snippet
            $this->fuckhtml->load($snippet["html"]);

            // parse reels
            $carousels = array_merge(
                $this->fuckhtml // for the reels only tab
                ->getElementsByClassName(
                    "fds-video-tab-shortform-desk-filter",
                    $div
                ),
                $this->fuckhtml // for the normal tab with reels inbetween
                ->getElementsByClassName(
                    "fds-video-tab-shortform-desk",
                    $div
                )
            );

            foreach($carousels as $carousel){

                $this->fuckhtml->load($carousel);

                $as =
                    $this->fuckhtml
                    ->getElementsByTagName(
                        "a"
                    );

                foreach($as as $reel){

                    $this->fuckhtml->load($reel);

                    $spans =
                        $this->fuckhtml
                        ->getElementsByTagName(
                            "span"
                        );

                    if(
                        count($spans) === 0 ||
                        !isset($reel["attributes"]["href"])
                    ){

                        // anchor without a title span or a target: not a reel
                        continue;
                    }

                    $title =
                        $this->fuckhtml
                        ->getTextContent(
                            $spans[0]
                        );

                    // get thumbnail
                    $thumb =
                        $this->fuckhtml
                        ->getElementsByAttributeValue(
                            "loading",
                            "lazy",
                            "img"
                        );

                    if(count($thumb) !== 0){

                        $thumb = [
                            "url" =>
                                $this->unshit_thumb(
                                    $this->fuckhtml
                                    ->getTextContent(
                                        $thumb[0]["attributes"]["src"]
                                    )
                                ),
                            "ratio" => "16:9"
                        ];
                    }else{

                        $thumb = [
                            "url" => null,
                            "ratio" => null
                        ];
                    }

                    $name =
                        $this->fuckhtml
                        ->getElementsByClassName(
                            "sds-comps-profile-info-title-text",
                            $spans
                        );

                    if(count($name) === 0){

                        $name = null;
                    }else{

                        $name =
                            $this->fuckhtml
                            ->getTextContent(
                                $name[0]
                            );
                    }

                    $out["reel"][] = [
                        "title" => $title,
                        "description" => null,
                        "author" => [
                            "name" => $name,
                            "url" => null,
                            "avatar" => null
                        ],
                        "date" => null,
                        "duration" => null,
                        "views" => null,
                        "thumb" => $thumb,
                        "url" =>
                            $this->fuckhtml
                            ->getTextContent(
                                $reel["attributes"]["href"]
                            )
                    ];
                }
            }
        }

        // get npt
        if(
            isset($json["url"]) &&
            $json["url"] != ""
        ){

            $out["npt"] =
                $this->backend->store(
                    $json["url"],
                    "images",
                    $proxy
                );
        }

        return $out;
    }

    /**
     * Convert the date prefix of a web result into a unix timestamp.
     *
     * Assumes absolute dates arrive as "YYYY.MM.DD." -- TODO confirm against
     * live responses. Other text is handed to strtotime() only when it
     * contains a digit, so a stray letter is not read as a timezone.
     *
     * @param string $text Raw prefix text.
     * @return int|null Timestamp, or null when no date could be read.
     */
    private function parse_prefix_date($text){

        $text = trim((string)$text);

        if(preg_match('/(\d{4})\.\s*(\d{1,2})\.\s*(\d{1,2})/', $text, $match)){

            $year = (int)$match[1];
            $month = (int)$match[2];
            $day = (int)$match[3];

            if(checkdate($month, $day, $year)){

                $date = strtotime(sprintf("%04d-%02d-%02d", $year, $month, $day));

                if($date !== false){

                    return $date;
                }
            }
        }

        $text = rtrim($text, ". ");

        if(
            $text === "" ||
            preg_match('/\d/', $text) !== 1
        ){

            return null;
        }

        $date = strtotime($text);

        return $date === false ? null : $date;
    }

    /**
     * Unwrap a search.pstatic.net thumbnail URL to the URL in its "src"
     * query parameter.
     *
     * @param string $url Thumbnail URL.
     * @return string The "src" value, or $url unchanged when the host differs,
     *                there is no usable "src", or the URL cannot be parsed.
     */
    private function unshit_thumb($url){

        $parts = parse_url($url);

        if(
            is_array($parts) &&
            isset($parts["host"], $parts["query"]) &&
            $parts["host"] == "search.pstatic.net"
        ){

            parse_str($parts["query"], $str);

            if(
                isset($str["src"]) &&
                is_string($str["src"])
            ){

                return $str["src"];
            }
        }

        return $url;
    }

    /**
     * Strip tags, decode HTML entities and trim surrounding dots.
     *
     * @param string $html Markup fragment.
     * @return string Plain text.
     */
    private function decode_html($html){

        return trim(
            html_entity_decode(
                strip_tags(
                    $html
                )
            ),
            "."
        );
    }

    /**
     * Convert "h:m:s", "m:s" or "s" into a number of seconds.
     *
     * @param string $time Colon-separated duration.
     * @return int Duration in seconds.
     */
    private function hms2int($time){

        $seconds = 0;

        // base-60 fold: each earlier field is worth 60 times the next one
        foreach(explode(":", $time, 3) as $part){

            $seconds = ($seconds * 60) + (int)$part;
        }

        return $seconds;
    }
}