2018-12-31 20:04:05 +01:00
|
|
|
//
|
2024-01-01 01:07:21 +01:00
|
|
|
// Copyright Aliaksei Levin (levlam@telegram.org), Arseny Smirnov (arseny30@gmail.com) 2014-2024
|
2018-12-31 20:04:05 +01:00
|
|
|
//
|
|
|
|
// Distributed under the Boost Software License, Version 1.0. (See accompanying
|
|
|
|
// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
|
|
|
|
//
|
|
|
|
#include "td/utils/HttpUrl.h"
|
|
|
|
|
|
|
|
#include "td/utils/format.h"
|
|
|
|
#include "td/utils/logging.h"
|
|
|
|
#include "td/utils/misc.h"
|
|
|
|
#include "td/utils/Parser.h"
|
2020-05-16 21:53:19 +02:00
|
|
|
#include "td/utils/port/IPAddress.h"
|
2023-01-30 11:40:09 +01:00
|
|
|
#include "td/utils/SliceBuilder.h"
|
2018-12-31 20:04:05 +01:00
|
|
|
|
2021-05-28 12:14:51 +02:00
|
|
|
#include <algorithm>
|
|
|
|
|
2018-12-31 20:04:05 +01:00
|
|
|
namespace td {
|
|
|
|
|
|
|
|
string HttpUrl::get_url() const {
|
|
|
|
string result;
|
|
|
|
switch (protocol_) {
|
2020-06-15 03:23:47 +02:00
|
|
|
case Protocol::Http:
|
2018-12-31 20:04:05 +01:00
|
|
|
result += "http://";
|
|
|
|
break;
|
2020-06-15 03:23:47 +02:00
|
|
|
case Protocol::Https:
|
2018-12-31 20:04:05 +01:00
|
|
|
result += "https://";
|
|
|
|
break;
|
|
|
|
default:
|
|
|
|
UNREACHABLE();
|
|
|
|
}
|
|
|
|
if (!userinfo_.empty()) {
|
|
|
|
result += userinfo_;
|
|
|
|
result += '@';
|
|
|
|
}
|
|
|
|
result += host_;
|
|
|
|
if (specified_port_ > 0) {
|
|
|
|
result += ':';
|
|
|
|
result += to_string(specified_port_);
|
|
|
|
}
|
2019-06-27 19:08:58 +02:00
|
|
|
LOG_CHECK(!query_.empty() && query_[0] == '/') << query_;
|
2018-12-31 20:04:05 +01:00
|
|
|
result += query_;
|
|
|
|
return result;
|
|
|
|
}
|
|
|
|
|
2019-08-05 11:42:42 +02:00
|
|
|
Result<HttpUrl> parse_url(Slice url, HttpUrl::Protocol default_protocol) {
|
2018-12-31 20:04:05 +01:00
|
|
|
// url == [https?://][userinfo@]host[:port]
|
2019-08-05 11:42:42 +02:00
|
|
|
ConstParser parser(url);
|
2020-05-07 02:12:47 +02:00
|
|
|
string protocol_str = to_lower(parser.read_till_nofail(":/?#@[]"));
|
2018-12-31 20:04:05 +01:00
|
|
|
|
|
|
|
HttpUrl::Protocol protocol;
|
2020-07-21 13:24:55 +02:00
|
|
|
if (parser.try_skip("://")) {
|
2018-12-31 20:04:05 +01:00
|
|
|
if (protocol_str == "http") {
|
2020-06-15 03:23:47 +02:00
|
|
|
protocol = HttpUrl::Protocol::Http;
|
2018-12-31 20:04:05 +01:00
|
|
|
} else if (protocol_str == "https") {
|
2020-06-15 03:23:47 +02:00
|
|
|
protocol = HttpUrl::Protocol::Https;
|
2018-12-31 20:04:05 +01:00
|
|
|
} else {
|
|
|
|
return Status::Error("Unsupported URL protocol");
|
|
|
|
}
|
|
|
|
} else {
|
2019-08-05 11:42:42 +02:00
|
|
|
parser = ConstParser(url);
|
2018-12-31 20:04:05 +01:00
|
|
|
protocol = default_protocol;
|
|
|
|
}
|
|
|
|
Slice userinfo_host_port = parser.read_till_nofail("/?#");
|
|
|
|
|
|
|
|
int port = 0;
|
|
|
|
const char *colon = userinfo_host_port.end() - 1;
|
|
|
|
while (colon > userinfo_host_port.begin() && *colon != ':' && *colon != ']' && *colon != '@') {
|
|
|
|
colon--;
|
|
|
|
}
|
|
|
|
Slice userinfo_host;
|
|
|
|
if (colon > userinfo_host_port.begin() && *colon == ':') {
|
2021-06-27 02:58:26 +02:00
|
|
|
Slice port_slice(colon + 1, userinfo_host_port.end());
|
|
|
|
while (port_slice.size() > 1 && port_slice[0] == '0') {
|
|
|
|
port_slice.remove_prefix(1);
|
|
|
|
}
|
|
|
|
auto r_port = to_integer_safe<int>(port_slice);
|
2021-05-26 00:20:49 +02:00
|
|
|
if (r_port.is_error() || r_port.ok() == 0) {
|
|
|
|
port = -1;
|
|
|
|
} else {
|
|
|
|
port = r_port.ok();
|
|
|
|
}
|
2018-12-31 20:04:05 +01:00
|
|
|
userinfo_host = Slice(userinfo_host_port.begin(), colon);
|
|
|
|
} else {
|
|
|
|
userinfo_host = userinfo_host_port;
|
|
|
|
}
|
|
|
|
if (port < 0 || port > 65535) {
|
|
|
|
return Status::Error("Wrong port number specified in the URL");
|
|
|
|
}
|
|
|
|
|
|
|
|
auto at_pos = userinfo_host.rfind('@');
|
|
|
|
Slice userinfo = at_pos == static_cast<size_t>(-1) ? "" : userinfo_host.substr(0, at_pos);
|
|
|
|
Slice host = userinfo_host.substr(at_pos + 1);
|
|
|
|
|
|
|
|
bool is_ipv6 = false;
|
|
|
|
if (!host.empty() && host[0] == '[' && host.back() == ']') {
|
2020-05-16 21:53:19 +02:00
|
|
|
IPAddress ip_address;
|
|
|
|
if (ip_address.init_ipv6_port(host.str(), 1).is_error()) {
|
|
|
|
return Status::Error("Wrong IPv6 address specified in the URL");
|
|
|
|
}
|
|
|
|
CHECK(ip_address.is_ipv6());
|
2018-12-31 20:04:05 +01:00
|
|
|
is_ipv6 = true;
|
|
|
|
}
|
|
|
|
if (host.empty()) {
|
|
|
|
return Status::Error("URL host is empty");
|
|
|
|
}
|
2019-02-12 15:53:00 +01:00
|
|
|
if (host == ".") {
|
|
|
|
return Status::Error("Host is invalid");
|
|
|
|
}
|
2018-12-31 20:04:05 +01:00
|
|
|
|
|
|
|
int specified_port = port;
|
|
|
|
if (port == 0) {
|
2020-06-15 03:23:47 +02:00
|
|
|
if (protocol == HttpUrl::Protocol::Http) {
|
2018-12-31 20:04:05 +01:00
|
|
|
port = 80;
|
|
|
|
} else {
|
2020-06-15 03:23:47 +02:00
|
|
|
CHECK(protocol == HttpUrl::Protocol::Https);
|
2018-12-31 20:04:05 +01:00
|
|
|
port = 443;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
Slice query = parser.read_all();
|
|
|
|
while (!query.empty() && is_space(query.back())) {
|
|
|
|
query.remove_suffix(1);
|
|
|
|
}
|
|
|
|
if (query.empty()) {
|
2019-02-13 00:29:52 +01:00
|
|
|
query = Slice("/");
|
2018-12-31 20:04:05 +01:00
|
|
|
}
|
|
|
|
string query_str;
|
|
|
|
if (query[0] != '/') {
|
|
|
|
query_str = '/';
|
|
|
|
}
|
|
|
|
for (auto c : query) {
|
|
|
|
if (static_cast<unsigned char>(c) <= 0x20) {
|
|
|
|
query_str += '%';
|
|
|
|
query_str += "0123456789ABCDEF"[c / 16];
|
|
|
|
query_str += "0123456789ABCDEF"[c % 16];
|
|
|
|
} else {
|
|
|
|
query_str += c;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2023-01-30 11:40:09 +01:00
|
|
|
auto check_url_part = [](Slice part, Slice name, bool allow_colon) {
|
|
|
|
for (size_t i = 0; i < part.size(); i++) {
|
|
|
|
char c = part[i];
|
|
|
|
if (is_alnum(c) || c == '.' || c == '-' || c == '_' || c == '!' || c == '$' || c == ',' || c == '~' || c == '*' ||
|
|
|
|
c == '\'' || c == '(' || c == ')' || c == ';' || c == '&' || c == '+' || c == '=' ||
|
|
|
|
(allow_colon && c == ':')) {
|
|
|
|
// symbols allowed by RFC 7230 and RFC 3986
|
2018-07-01 01:29:36 +02:00
|
|
|
continue;
|
|
|
|
}
|
2023-01-30 11:40:09 +01:00
|
|
|
if (c == '%') {
|
|
|
|
c = part[++i];
|
|
|
|
if (is_hex_digit(c)) {
|
|
|
|
c = part[++i];
|
|
|
|
if (is_hex_digit(c)) {
|
|
|
|
// percent encoded symbol as allowed by RFC 7230 and RFC 3986
|
|
|
|
continue;
|
|
|
|
}
|
2018-12-31 20:04:05 +01:00
|
|
|
}
|
2023-01-30 11:40:09 +01:00
|
|
|
return Status::Error(PSLICE() << "Wrong percent-encoded symbol in URL " << name);
|
2018-12-31 20:04:05 +01:00
|
|
|
}
|
2023-01-30 11:40:09 +01:00
|
|
|
|
|
|
|
// all other symbols aren't allowed
|
|
|
|
auto uc = static_cast<unsigned char>(c);
|
|
|
|
if (uc >= 128) {
|
|
|
|
// but we allow plain UTF-8 symbols
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
return Status::Error(PSLICE() << "Disallowed character in URL " << name);
|
2018-12-31 20:04:05 +01:00
|
|
|
}
|
2023-01-30 11:40:09 +01:00
|
|
|
return Status::OK();
|
|
|
|
};
|
2018-07-01 01:29:36 +02:00
|
|
|
|
2023-01-30 11:40:09 +01:00
|
|
|
string host_str = to_lower(host);
|
|
|
|
if (is_ipv6) {
|
|
|
|
for (size_t i = 1; i + 1 < host_str.size(); i++) {
|
|
|
|
char c = host_str[i];
|
|
|
|
if (c == ':' || ('0' <= c && c <= '9') || ('a' <= c && c <= 'f') || c == '.') {
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
return Status::Error("Wrong IPv6 URL host");
|
2018-12-31 20:04:05 +01:00
|
|
|
}
|
2023-01-30 11:40:09 +01:00
|
|
|
} else {
|
|
|
|
TRY_STATUS(check_url_part(host_str, "host", false));
|
|
|
|
TRY_STATUS(check_url_part(userinfo, "userinfo", true));
|
2018-12-31 20:04:05 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
return HttpUrl{protocol, userinfo.str(), std::move(host_str), is_ipv6, specified_port, port, std::move(query_str)};
|
|
|
|
}
|
|
|
|
|
|
|
|
StringBuilder &operator<<(StringBuilder &sb, const HttpUrl &url) {
|
2020-06-15 03:23:47 +02:00
|
|
|
sb << tag("protocol", url.protocol_ == HttpUrl::Protocol::Http ? "HTTP" : "HTTPS") << tag("userinfo", url.userinfo_)
|
2018-12-31 20:04:05 +01:00
|
|
|
<< tag("host", url.host_) << tag("port", url.port_) << tag("query", url.query_);
|
|
|
|
return sb;
|
|
|
|
}
|
|
|
|
|
2021-05-24 22:47:18 +02:00
|
|
|
HttpUrlQuery parse_url_query(Slice query) {
|
2021-05-25 03:39:41 +02:00
|
|
|
if (!query.empty() && query[0] == '/') {
|
|
|
|
query.remove_prefix(1);
|
|
|
|
}
|
|
|
|
|
2021-05-24 22:47:18 +02:00
|
|
|
size_t path_size = 0;
|
|
|
|
while (path_size < query.size() && query[path_size] != '?' && query[path_size] != '#') {
|
|
|
|
path_size++;
|
|
|
|
}
|
|
|
|
|
|
|
|
HttpUrlQuery result;
|
2021-05-25 03:39:41 +02:00
|
|
|
result.path_ = full_split(url_decode(query.substr(0, path_size), false), '/');
|
2022-09-16 00:30:54 +02:00
|
|
|
while (!result.path_.empty() && result.path_.back().empty()) {
|
2021-05-25 03:39:41 +02:00
|
|
|
result.path_.pop_back();
|
|
|
|
}
|
2021-05-24 22:47:18 +02:00
|
|
|
|
|
|
|
if (path_size < query.size() && query[path_size] == '?') {
|
|
|
|
query = query.substr(path_size + 1);
|
|
|
|
query.truncate(query.find('#'));
|
|
|
|
|
|
|
|
ConstParser parser(query);
|
|
|
|
while (!parser.data().empty()) {
|
|
|
|
auto key_value = split(parser.read_till_nofail('&'), '=');
|
|
|
|
parser.skip_nofail('&');
|
|
|
|
auto key = url_decode(key_value.first, true);
|
|
|
|
if (!key.empty()) {
|
|
|
|
result.args_.emplace_back(std::move(key), url_decode(key_value.second, true));
|
|
|
|
}
|
|
|
|
}
|
|
|
|
CHECK(parser.status().is_ok());
|
|
|
|
}
|
|
|
|
|
|
|
|
return result;
|
|
|
|
}
|
|
|
|
|
2022-04-01 13:00:34 +02:00
|
|
|
bool HttpUrlQuery::has_arg(Slice key) const {
|
|
|
|
auto it =
|
|
|
|
std::find_if(args_.begin(), args_.end(), [&key](const std::pair<string, string> &s) { return s.first == key; });
|
|
|
|
return it != args_.end();
|
|
|
|
}
|
|
|
|
|
2021-05-24 22:47:18 +02:00
|
|
|
Slice HttpUrlQuery::get_arg(Slice key) const {
|
|
|
|
auto it =
|
|
|
|
std::find_if(args_.begin(), args_.end(), [&key](const std::pair<string, string> &s) { return s.first == key; });
|
|
|
|
return it == args_.end() ? Slice() : it->second;
|
|
|
|
}
|
|
|
|
|
2022-09-26 18:24:39 +02:00
|
|
|
string get_url_host(Slice url) {
|
|
|
|
auto r_http_url = parse_url(url);
|
|
|
|
if (r_http_url.is_error()) {
|
|
|
|
return string();
|
|
|
|
}
|
|
|
|
return r_http_url.ok().host_;
|
|
|
|
}
|
|
|
|
|
2018-02-19 23:54:50 +01:00
|
|
|
string get_url_query_file_name(const string &query) {
|
|
|
|
Slice query_slice = query;
|
|
|
|
query_slice.truncate(query.find_first_of("?#"));
|
|
|
|
|
|
|
|
auto slash_pos = query_slice.rfind('/');
|
|
|
|
if (slash_pos < query_slice.size()) {
|
|
|
|
return query_slice.substr(slash_pos + 1).str();
|
|
|
|
}
|
|
|
|
return query_slice.str();
|
|
|
|
}
|
|
|
|
|
2019-08-05 11:56:28 +02:00
|
|
|
string get_url_file_name(Slice url) {
|
|
|
|
auto r_http_url = parse_url(url);
|
2018-02-20 00:29:19 +01:00
|
|
|
if (r_http_url.is_error()) {
|
|
|
|
LOG(WARNING) << "Receive wrong URL \"" << url << '"';
|
|
|
|
return string();
|
|
|
|
}
|
|
|
|
return get_url_query_file_name(r_http_url.ok().query_);
|
|
|
|
}
|
|
|
|
|
2018-12-31 20:04:05 +01:00
|
|
|
} // namespace td
|