90 lines
2.3 KiB
Python
90 lines
2.3 KiB
Python
#!/usr/bin/python3
|
|
import sys
|
|
import codecs
|
|
|
|
import http.client
|
|
import email.parser
|
|
|
|
# Optional dependency: bind the real module when it is importable, otherwise
# leave a falsy placeholder so callers can simply test ``if cchardet:``.
try:
    import cchardet
except ImportError:  # pragma: no cover
    cchardet = False
|
|
|
|
def isUTF8Strict(data):  # pragma: no cover - Only used when cchardet is missing.
    '''
    Report whether a bytes-like value is strictly valid UTF-8.

    Returns False when the data cannot be decoded as UTF-8, or when the
    decoded text contains a code point in the surrogate range
    U+D800..U+DFFF. Returns True otherwise.
    '''
    try:
        text = data.decode('UTF-8')
    except UnicodeDecodeError:
        return False

    # Surrogate code points are not legal in well-formed UTF-8 text.
    return not any(0xD800 <= ord(symbol) <= 0xDFFF for symbol in text)
|
|
|
|
def decode_headers(header_list):
    '''
    Decode a list of raw header lines.

    Takes a list of bytestrings and returns a list of unicode strings of
    the same length and order. The character set of each bytestring is
    determined individually:

    * With cchardet available, the detected encoding is used when the
      detector is more than 80% confident; otherwise (or when the detected
      encoding cannot actually decode the line) iso-8859-1 is used.
    * Without cchardet, pure 7-bit lines are decoded as us-ascii, strictly
      valid UTF-8 lines as utf-8, and everything else as iso-8859-1.

    iso-8859-1 maps every byte value to a code point, so the final fallback
    cannot raise.
    '''

    decoded_headers = []
    for header in header_list:
        if cchardet:
            inferred = cchardet.detect(header) or {}
            encoding = inferred.get('encoding')
            # The detector may report None for both fields when it cannot
            # decide (e.g. an empty line); treat that as zero confidence
            # rather than comparing None against a float.
            confidence = inferred.get('confidence') or 0.0

            text = None
            if encoding and confidence > 0.8:
                try:
                    text = header.decode(encoding)
                except (LookupError, UnicodeDecodeError):
                    # Unknown codec name or a wrong guess: fall through to
                    # the lossless single-byte fallback below.
                    text = None
            if text is None:
                text = header.decode('iso-8859-1')
        else:  # pragma: no cover
            # High bit clear on every byte means plain 7-bit ASCII.
            if all(byte & 0x80 == 0 for byte in header):
                text = header.decode("us-ascii")
            elif isUTF8Strict(header):
                text = header.decode("utf-8")
            else:
                text = header.decode('iso-8859-1')
        decoded_headers.append(text)

    return decoded_headers
|
|
|
|
|
|
def parse_headers(fp, _class=http.client.HTTPMessage):
    """Parses only RFC2822 headers from a file pointer.

    email Parser wants to see strings rather than bytes.
    But a TextIOWrapper around self.rfile would buffer too many bytes
    from the stream, bytes which we later need to read as bytes.
    So we read the correct bytes here, as bytes, for email Parser
    to parse.

    Note: Monkey-patched version to try to more intelligently determine
    header encoding.

    Lines are read from ``fp`` until a blank line (``b'\\r\\n'`` or
    ``b'\\n'``) or end of stream (``b''``) is seen; the terminating line is
    included in what gets decoded. The decoded lines are joined and handed
    to ``email.parser.Parser(_class=_class)``.

    Raises:
        http.client.LineTooLong: a single line exceeds
            ``http.client._MAXLINE`` bytes.
        http.client.HTTPException: more than ``http.client._MAXHEADERS``
            lines were read.
    """
    headers = []
    while True:
        # Ask for one byte more than the limit so an over-long line can be
        # told apart from one that is exactly at the limit.
        line = fp.readline(http.client._MAXLINE + 1)
        if len(line) > http.client._MAXLINE:
            raise http.client.LineTooLong("header line")
        headers.append(line)
        if len(headers) > http.client._MAXHEADERS:
            # Must be qualified: only the ``http.client`` module is imported
            # in this file, so a bare ``HTTPException`` would be a NameError.
            raise http.client.HTTPException(
                "got more than %d headers" % http.client._MAXHEADERS)
        if line in (b'\r\n', b'\n', b''):
            break

    decoded_headers = decode_headers(headers)

    hstring = ''.join(decoded_headers)

    return email.parser.Parser(_class=_class).parsestr(hstring)
|
|
|
|
# Module-level side effect: rebind http.client.parse_headers to the
# encoding-aware replacement defined in this file, so that later lookups of
# that attribute resolve to this version.
http.client.parse_headers = parse_headers
|