Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Correctly parse HTTP accept headers #75

Merged
merged 12 commits into from
Jan 22, 2025
134 changes: 134 additions & 0 deletions lib/protocol/http/header/accept.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,134 @@
# frozen_string_literal: true

# Released under the MIT License.
# Copyright, 2025, by Samuel Williams.

require_relative "split"
require_relative "quoted_string"
require_relative "../error"

module Protocol
module HTTP
module Header
# The `accept` header represents a list of media ranges (content types) that the client can accept.
#
# The header is stored as an array of the raw, comma-separated entries; use {media_ranges} to obtain the parsed representation.
class Accept < Array
  # Regular expression used to extract the comma-separated entries, keeping commas that occur inside quoted strings as part of the entry.
  SEPARATOR = /
    (?:            # Start non-capturing group
      "[^"\\]*"    # Match quoted strings (no escaping of quotes within)
      |            # OR
      [^,"]+       # Match non-quoted strings until a comma or quote
    )+
    (?=,|\z)       # Match until a comma or end of string
  /x

  # Raised by {media_ranges} when an entry is not of the form `type/subtype[;parameters]`.
  ParseError = Class.new(Error)

  # Matches `type/subtype` followed by the (unparsed) parameter section.
  MEDIA_RANGE = /\A(?<type>#{TOKEN})\/(?<subtype>#{TOKEN})(?<parameters>.*)\z/

  # Matches a single `;key=value` parameter where the value is either a token or a quoted string.
  PARAMETER = /\s*;\s*(?<key>#{TOKEN})=((?<value>#{TOKEN})|(?<quoted_value>#{QUOTED_STRING}))/

  # A single entry in the Accept: header, which includes a mime type and associated parameters. A media range can include wild cards, but a media type is a specific type and subtype.
  MediaRange = Struct.new(:type, :subtype, :parameters) do
    # @parameter type [String] the type, e.g. `text` or `*`.
    # @parameter subtype [String] the subtype, e.g. `html` or `*`.
    # @parameter parameters [Hash] the parameters keyed by name, e.g. `{"q" => "0.5"}`.
    def initialize(type, subtype = "*", parameters = {})
      super(type, subtype, parameters)
    end

    # Orders media ranges by descending quality factor, so that sorting puts the most preferred entry first.
    def <=> other
      other.quality_factor <=> self.quality_factor
    end

    # Serializes the parameters as `;key=value` pairs, quoting values where required.
    #
    # @returns [String] the serialized parameters, or an empty string if there are none.
    def parameters_string
      return "" if parameters.nil? || parameters.empty?

      parameters.map do |key, value|
        ";#{key}=#{QuotedString.quote(value.to_s)}"
      end.join
    end

    # Compares against another media range using the struct comparison, or against any other object using the `type/subtype` string.
    def === other
      if other.is_a?(self.class)
        super
      else
        range_string === other
      end
    end

    # @returns [String] the `type/subtype` string without parameters.
    def range_string
      "#{type}/#{subtype}"
    end

    # @returns [String] the `type/subtype` string followed by the serialized parameters.
    def to_s
      "#{type}/#{subtype}#{parameters_string}"
    end

    alias to_str to_s

    # The value of the `q` parameter as a float, defaulting to 1.0 when absent.
    #
    # @returns [Float] the quality factor.
    def quality_factor
      # Tolerate missing parameters the same way `parameters_string` does:
      return 1.0 unless parameters

      parameters.fetch("q", 1.0).to_f
    end

    # Ignores its arguments and returns the type and subtype as a pair.
    #
    # @returns [Array(String, String)] the type and subtype.
    def split(*args)
      [type, subtype]
    end
  end

  # Parse the `accept` header value into a list of raw entries.
  #
  # @parameter value [String | Nil] the value of the header.
  def initialize(value = nil)
    super(split_entries(value)) if value
  end

  # Adds one or more comma-separated values to the header.
  #
  # The input string is split into distinct entries and appended to the array.
  #
  # @parameter value [String] the value or values to add, separated by commas.
  def <<(value)
    concat(split_entries(value))
  end

  # Serializes the stored values into a comma-separated string.
  #
  # @returns [String] the serialized representation of the header values.
  def to_s
    join(",")
  end

  # Parse the stored entries of the `accept` header.
  #
  # @returns [Array(MediaRange)] the list of media ranges and their associated parameters.
  # @raises [ParseError] if an entry is not a valid media range.
  def media_ranges
    map {|entry| parse_media_range(entry)}
  end

  private

  # Splits a raw header value into stripped entries. Whitespace-only elements (e.g. from a trailing ", ") are dropped, since empty list elements carry no meaning and would otherwise fail to parse later.
  def split_entries(value)
    value.scan(SEPARATOR).map(&:strip).reject(&:empty?)
  end

  # Converts a single raw entry into a {MediaRange}, unquoting any quoted parameter values.
  def parse_media_range(value)
    match = value.match(MEDIA_RANGE)
    raise ParseError, "Invalid media type: #{value.inspect}" unless match

    parameters = {}

    # The block variables are named so that they do not shadow the `value` argument:
    match[:parameters].scan(PARAMETER) do |key, token, quoted|
      parameters[key] = quoted ? QuotedString.unquote(quoted) : token
    end

    MediaRange.new(match[:type], match[:subtype], parameters)
  end
end
end
end
end
45 changes: 45 additions & 0 deletions lib/protocol/http/header/accept_charset.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
# frozen_string_literal: true

# Released under the MIT License.
# Copyright, 2025, by Samuel Williams.

require_relative "split"
require_relative "quoted_string"
require_relative "../error"

module Protocol
module HTTP
module Header
# The `accept-charset` header represents a list of character sets that the client can accept.
class AcceptCharset < Split
  # Raised by {charsets} when an entry is not of the form `name[;q=qvalue]`.
  ParseError = Class.new(Error)

  # https://tools.ietf.org/html/rfc7231#section-5.3.3
  #
  # Optional whitespace is accepted around the `;` that introduces the weight, so that both `utf-8;q=0.5` and `utf-8; q=0.5` parse.
  CHARSET = /\A(?<name>#{TOKEN})(\s*;\s*q=(?<q>#{QVALUE}))?\z/

  # A character set name together with the raw `q` value (a string, or nil when no weight was given).
  Charset = Struct.new(:name, :q) do
    # @returns [Float] the weight of this entry, defaulting to 1.0 when no `q` value was given.
    def quality_factor
      (q || 1.0).to_f
    end

    # Orders character sets by descending quality factor.
    def <=> other
      other.quality_factor <=> self.quality_factor
    end
  end

  # Parse the `accept-charset` header value into a list of character sets.
  #
  # @returns [Array(Charset)] the list of character sets and their associated quality factors.
  # @raises [ParseError] if an entry does not match {CHARSET}.
  def charsets
    map do |value|
      match = value.match(CHARSET)
      raise ParseError, "Could not parse character set: #{value.inspect}" unless match

      Charset.new(match[:name], match[:q])
    end
  end
end
end
end
end
48 changes: 48 additions & 0 deletions lib/protocol/http/header/accept_encoding.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
# frozen_string_literal: true

# Released under the MIT License.
# Copyright, 2025, by Samuel Williams.

require_relative "split"
require_relative "quoted_string"
require_relative "../error"

module Protocol
module HTTP
module Header
# The `accept-encoding` header represents a list of encodings that the client can accept.
class AcceptEncoding < Split
  # Raised by {encodings} when an entry is not of the form `name[;q=qvalue]`.
  ParseError = Class.new(Error)

  # https://tools.ietf.org/html/rfc7231#section-5.3.1
  QVALUE = /0(\.[0-9]{0,3})?|1(\.[0]{0,3})?/

  # https://tools.ietf.org/html/rfc7231#section-5.3.4
  #
  # Optional whitespace is accepted around the `;` that introduces the weight, so that both `gzip;q=0.5` and `gzip; q=0.5` parse.
  ENCODING = /\A(?<name>#{TOKEN})(\s*;\s*q=(?<q>#{QVALUE}))?\z/

  # An encoding name together with the raw `q` value (a string, or nil when no weight was given).
  Encoding = Struct.new(:name, :q) do
    # @returns [Float] the weight of this entry, defaulting to 1.0 when no `q` value was given.
    def quality_factor
      (q || 1.0).to_f
    end

    # Orders encodings by descending quality factor.
    def <=> other
      other.quality_factor <=> self.quality_factor
    end
  end

  # Parse the `accept-encoding` header value into a list of encodings.
  #
  # @returns [Array(Encoding)] the list of encodings and their associated quality factors.
  # @raises [ParseError] if an entry does not match {ENCODING}.
  def encodings
    map do |value|
      match = value.match(ENCODING)
      raise ParseError, "Could not parse encoding: #{value.inspect}" unless match

      Encoding.new(match[:name], match[:q])
    end
  end
end
end
end
end
51 changes: 51 additions & 0 deletions lib/protocol/http/header/accept_language.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
# frozen_string_literal: true

# Released under the MIT License.
# Copyright, 2025, by Samuel Williams.

require_relative "split"
require_relative "quoted_string"
require_relative "../error"

module Protocol
module HTTP
module Header
# The `accept-language` header lists the languages the client is willing to accept, each with an optional weight.
class AcceptLanguage < Split
  # Raised by {languages} when an entry does not match {LANGUAGE}.
  ParseError = Class.new(Error)

  # https://tools.ietf.org/html/rfc3066#section-2.1
  NAME = /\*|[A-Z]{1,8}(-[A-Z0-9]{1,8})*/i

  # https://www.w3.org/Protocols/rfc2616/rfc2616-sec3.html#sec3.9
  QVALUE = /0(\.[0-9]{0,6})?|1(\.[0]{0,6})?/

  # https://greenbytes.de/tech/webdav/rfc7231.html#quality.values
  LANGUAGE = /\A(?<name>#{NAME})(\s*;\s*q=(?<q>#{QVALUE}))?\z/

  # A language name paired with the raw `q` value captured from the header (nil when no weight was given).
  Language = Struct.new(:name, :q) do
    # @returns [Float] the weight of this entry; 1.0 when no `q` value is present.
    def quality_factor
      q ? q.to_f : 1.0
    end

    # Compares by quality factor in reverse, so higher weights sort first.
    def <=> other
      theirs = other.quality_factor
      theirs <=> quality_factor
    end
  end

  # Parse each stored entry of the `accept-language` header into a {Language}.
  #
  # @returns [Array(Language)] the languages and their associated quality factors.
  # @raises [ParseError] if an entry does not match {LANGUAGE}.
  def languages
    map do |entry|
      parsed = entry.match(LANGUAGE)

      unless parsed
        raise ParseError, "Could not parse language: #{entry.inspect}"
      end

      Language.new(parsed[:name], parsed[:q])
    end
  end
end
end
end
end
49 changes: 49 additions & 0 deletions lib/protocol/http/header/quoted_string.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
# frozen_string_literal: true

# Released under the MIT License.
# Copyright, 2025, by Samuel Williams.

module Protocol
module HTTP
module Header
# According to https://tools.ietf.org/html/rfc7231#appendix-C
TOKEN = /[!#$%&'*+\-.^_`|~0-9A-Z]+/i

# A double-quoted string in which a backslash escapes the following character.
#
# Escapes are consumed as pairs, so a closing quote that follows an escaped backslash (`"a\\"`) correctly terminates the string. As before, a quoted string never spans a line feed.
QUOTED_STRING = /"(?:[^"\\\n]|\\.)*"/

# https://tools.ietf.org/html/rfc7231#section-5.3.1
QVALUE = /0(\.[0-9]{0,3})?|1(\.[0]{0,3})?/

# Handling of HTTP quoted strings.
module QuotedString
  # Unquote a "quoted-string" value according to <https://tools.ietf.org/html/rfc7230#section-3.2.6>. It should already match the QUOTED_STRING pattern above by the parser.
  #
  # @parameter value [String] the quoted string, including the surrounding double quotes.
  # @parameter normalize_whitespace [Boolean] whether to collapse folded line breaks into a single space.
  # @returns [String] a new string with the quotes removed and backslash escapes resolved.
  def self.unquote(value, normalize_whitespace = true)
    # Drop the surrounding quotes, then replace each `\x` escape with `x`:
    unquoted = value[1...-1].gsub(/\\(.)/, '\1')

    if normalize_whitespace
      # LWS = [CRLF] 1*( SP | HT )
      unquoted = unquoted.gsub(/[\r\n]+\s+/, " ")
    end

    unquoted
  end

  # Characters that cannot appear in a bare token and therefore force quoting.
  QUOTES_REQUIRED = /[()<>@,;:\\"\/\[\]?={} \t]/

  # Quote a string for HTTP header values if required.
  #
  # The value is quoted when `force` is set, when it contains a character from QUOTES_REQUIRED, or when it is empty (an empty value is not a valid token, so it can only be expressed as `""`). Double quotes and backslashes inside the value are backslash-escaped. No other validation of the value is performed.
  #
  # @parameter value [String] the value to serialize.
  # @parameter force [Boolean] whether to quote the value even if it is a valid token.
  # @returns [String] the quoted string, or the original value if no quoting was needed.
  def self.quote(value, force = false)
    if force || value.empty? || value.match?(QUOTES_REQUIRED)
      "\"#{value.gsub(/["\\]/, '\\\\\0')}\""
    else
      value
    end
  end
end
end
end
end
2 changes: 1 addition & 1 deletion lib/protocol/http/header/split.rb
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ def initialize(value = nil)
#
# @parameter value [String] the value or values to add, separated by commas.
# Splits the given value on COMMA and appends the resulting entries exactly once.
# `concat` takes the array directly, so no splat is needed.
def << value
  self.concat(value.split(COMMA))
end

# Serializes the stored values into a comma-separated string.
Expand Down
14 changes: 13 additions & 1 deletion lib/protocol/http/headers.rb
Original file line number Diff line number Diff line change
@@ -1,10 +1,11 @@
# frozen_string_literal: true

# Released under the MIT License.
# Copyright, 2018-2024, by Samuel Williams.
# Copyright, 2018-2025, by Samuel Williams.

require_relative "header/split"
require_relative "header/multiple"

require_relative "header/cookie"
require_relative "header/connection"
require_relative "header/cache_control"
Expand All @@ -15,6 +16,11 @@
require_relative "header/date"
require_relative "header/priority"

require_relative "header/accept"
require_relative "header/accept_charset"
require_relative "header/accept_encoding"
require_relative "header/accept_language"

module Protocol
module HTTP
# @namespace
Expand Down Expand Up @@ -277,6 +283,12 @@ def []= key, value
"last-modified" => Header::Date,
"if-modified-since" => Header::Date,
"if-unmodified-since" => Header::Date,

# Accept headers:
"accept" => Header::Accept,
"accept-charset" => Header::AcceptCharset,
"accept-encoding" => Header::AcceptEncoding,
"accept-language" => Header::AcceptLanguage,
}.tap{|hash| hash.default = Split}

# Delete all header values for the given key, and return the merged value.
Expand Down
4 changes: 4 additions & 0 deletions releases.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,9 @@
# Releases

## Unreleased

- Add support for parsing `accept`, `accept-charset`, `accept-encoding` and `accept-language` headers into structured values.

## v0.46.0

- Add support for `priority:` header.
Expand Down
Loading
Loading