-booruScraper/util/WebRequest/HeaderParseMonkeyPatch.py

90 lines
2.3 KiB
Python

#!/usr/bin/python3
import sys
import codecs
import http.client
import email.parser
# Optional dependency: when cchardet is not installed, the name is bound to
# False so the rest of the module can test it with a plain `if cchardet:`.
try:
    import cchardet
except ImportError: # pragma: no cover
    cchardet = False
def isUTF8Strict(data): # pragma: no cover - Only used when cchardet is missing.
    '''
    Report whether a byte sequence is strictly valid UTF-8.

    Returns True when `data` decodes as UTF-8 and the decoded text
    contains no code point in the surrogate range (U+D800 - U+DFFF).
    Returns False if decoding fails or a surrogate is present.
    '''
    try:
        text = data.decode('UTF-8')
    except UnicodeDecodeError:
        return False

    # Surrogate code points are not valid scalar values, so their presence
    # disqualifies the input even if the decode itself succeeded.
    has_surrogate = any(0xD800 <= ord(char) <= 0xDFFF for char in text)
    return not has_surrogate
def decode_headers(header_list):
    '''
    Decode a list of headers.

    Takes a list of bytestrings, returns a list of unicode strings.
    The character set for each bytestring is individually decoded.

    When cchardet is available, its guess is used only if it names an
    encoding with a confidence above 0.8 and the header really decodes
    with that encoding. In every other case the header is decoded as
    iso-8859-1, which accepts any byte sequence.

    Without cchardet, each header is decoded as us-ascii if it is pure
    7-bit, as utf-8 if it is strictly valid UTF-8, and as iso-8859-1
    otherwise.
    '''
    decoded_headers = []
    for header in header_list:
        if cchardet:
            inferred = cchardet.detect(header) or {}
            encoding = inferred.get('encoding')
            confidence = inferred.get('confidence')

            decoded = None
            # The detector may report None for encoding and/or confidence
            # when it cannot guess (e.g. for empty or very short input), so
            # both are checked before comparing against the threshold.
            if encoding and confidence is not None and confidence > 0.8:
                try:
                    # print("Parsing headers!", header)
                    decoded = header.decode(encoding)
                except (LookupError, UnicodeDecodeError):
                    # Unknown codec name or a wrong guess: use the fallback.
                    decoded = None

            if decoded is None:
                decoded = header.decode('iso-8859-1')
            decoded_headers.append(decoded)
        else: # pragma: no cover
            # All bytes are < 127 (e.g. ASCII)
            if all(char & 0x80 == 0 for char in header):
                decoded_headers.append(header.decode("us-ascii"))
            elif isUTF8Strict(header):
                decoded_headers.append(header.decode("utf-8"))
            else:
                decoded_headers.append(header.decode('iso-8859-1'))
    return decoded_headers
def parse_headers(fp, _class=http.client.HTTPMessage):
    """Parses only RFC2822 headers from a file pointer.

    email Parser wants to see strings rather than bytes.
    But a TextIOWrapper around self.rfile would buffer too many bytes
    from the stream, bytes which we later need to read as bytes.
    So we read the correct bytes here, as bytes, for email Parser
    to parse.

    Note: Monkey-patched version to try to more intelligently determine
    header encoding

    Parameters:
        fp: binary file-like object; lines are pulled with ``readline``.
        _class: message class handed to ``email.parser.Parser``.

    Returns:
        The message object produced by parsing the decoded header text.

    Raises:
        http.client.LineTooLong: a header line exceeds ``http.client._MAXLINE``.
        http.client.HTTPException: more than ``http.client._MAXHEADERS``
            lines were read before the end of the header section.
    """
    headers = []
    while True:
        # Read one byte past the limit so an over-long line is detectable.
        line = fp.readline(http.client._MAXLINE + 1)
        if len(line) > http.client._MAXLINE:
            raise http.client.LineTooLong("header line")
        headers.append(line)
        if len(headers) > http.client._MAXHEADERS:
            # Must be qualified: only the http.client module is imported here.
            raise http.client.HTTPException("got more than %d headers" % http.client._MAXHEADERS)
        # A blank line ends the header section; b'' means the stream ended.
        if line in (b'\r\n', b'\n', b''):
            break

    decoded_headers = decode_headers(headers)
    hstring = ''.join(decoded_headers)
    return email.parser.Parser(_class=_class).parsestr(hstring)
# Monkey-patch: rebind http.client.parse_headers to the parse_headers function
# defined in this module. This runs as a side effect of importing the module.
http.client.parse_headers = parse_headers