| # Copyright (c) 2011 The Chromium OS Authors. All rights reserved. |
| # Use of this source code is governed by a BSD-style license that can be |
| # found in the LICENSE file. |
| |
| """HTTP Client classes for Recall server. |
| |
| This module should not be imported directly; instead, the public classes
| are imported into the top-level recall package.
| """ |
| |
| __all__ = ["HTTPConnection", "HTTPSConnection", "HTTPRequest", "HTTPResponse", |
| "HTTPClient", "HTTPMiddleware", "ArchivingHTTPClient"] |
| |
| import httplib |
| import logging |
| import struct |
| import threading |
| import time |
| import zlib |
| |
| |
| class HTTPResponse(httplib.HTTPResponse): |
| """Recordable HTTP Response. |
| |
| Subclasses the httplib.HTTPResponse class to add the ability to record
| the data received from the server and the delays between chunks, to
| mutate the received data, and to be pickled for later playback.
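|
| Example (an illustrative sketch, not taken from the original code; it
| assumes a response obtained from HTTPConnection.getresponse()):
|
|   import pickle
|   data = ''.join(response.chunks)  # reads the body and records the chunks
|   blob = pickle.dumps(response)    # archive for later playback
|   replay = pickle.loads(blob)
|   assert ''.join(replay.chunks) == data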
| """ |
| |
| logger = logging.getLogger("HTTPResponse") |
| |
| def __init__(self, *args, **kwds): |
| httplib.HTTPResponse.__init__(self, *args, **kwds) |
| self._mutate_functions = [] |
| self._decompressor = None |
| self._compressor = None |
| |
| def __getstate__(self): |
| state = self.__dict__.copy() |
| if 'fp' in state: |
| del state['fp'] |
| del state['_mutate_functions'] |
| del state['_decompressor'] |
| del state['_compressor'] |
| return state |
| |
| def __setstate__(self, state): |
| self.__dict__.update(state) |
| |
| self._mutate_functions = [] |
| self._decompressor = None |
| self._compressor = None |
| |
| @property |
| def headers(self): |
| """List of (header, value) tuples. |
| |
| Unlike the superclass, this doesn't merge duplicate headers since that |
| breaks cookies. |
| """ |
| if self.msg is None: |
| raise httplib.ResponseNotReady() |
| |
| self._headers = [] |
| for line in self.msg.headers: |
| sep = line.index(':') |
| header = line[:sep].rstrip() |
| value = line[sep+1:].lstrip().rstrip('\r\n') |
| self._headers.append((header, value)) |
| |
| return self._headers |
| |
| def _read_status(self): |
| """Read status line from server. |
| |
| Wrapped to set the time we received the status line, which gets converted |
| to a delay by HTTPConnection.getresponse() |
| """ |
| ret = httplib.HTTPResponse._read_status(self) |
| self._status_receive_time = time.time() |
| return ret |
| |
| def AddMutateFunction(self, function): |
| """Add a function to mutate data chunks. |
| |
| Must be called before the chunks property is accessed and before any |
| headers are sent. |
| |
| Performing mutation drops the Content-Length header from the response,
| since the mutated body may change length; the server must regenerate it
| if necessary. Mutation also forces on-the-fly decompression and
| recompression of the incoming chunks.
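|
| Example (an illustrative sketch; ScrubChunk is a made-up mutate
| function, not part of this module):
|
|   def ScrubChunk(chunk):
|     return chunk.replace('internal.example.com', 'localhost')
|
|   response.AddMutateFunction(ScrubChunk)
|   data = ''.join(response.chunks)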
| """ |
| self._mutate_functions.append(function) |
| # Mutating means the length of the content may change; the server
| # must recalculate it on the fly
| del self.msg['Content-Length'] |
| |
| # It also means we have to decompress and recompress the data afterwards
| content_encoding = self.getheader('Content-Encoding')
| if content_encoding in ('gzip', 'deflate'):
| self.logger.debug("Will decompress data in order to mutate")
| if content_encoding == 'gzip': |
| self._decompressor = zlib.decompressobj(16 + zlib.MAX_WBITS) |
| self._compressor = HTTPGzipCompressor() |
| elif content_encoding == 'deflate': |
| self._decompressor = zlib.decompressobj(-zlib.MAX_WBITS) |
| self._compressor = zlib.compressobj() |
| |
| def _RecordChunk(self, chunk, last=False): |
| """Record and mutate chunk received from the server. |
| |
| This method handles the decompression and recompression of chunks. |
| """ |
| delay = time.time() - self.start_time |
| if self._decompressor: |
| chunk = self._decompressor.decompress(chunk) |
| for mutate_function in self._mutate_functions: |
| chunk = mutate_function(chunk) |
| if self._compressor: |
| chunk = self._compressor.compress(chunk) |
| if last: |
| chunk += self._compressor.flush(zlib.Z_FULL_FLUSH) |
| chunk += self._compressor.flush() |
| else: |
| chunk += self._compressor.flush(zlib.Z_SYNC_FLUSH) |
| |
| self._chunks.append((delay, chunk)) |
| return chunk |
| |
| @property |
| def chunks(self): |
| """Read data chunks from server. |
| |
| This yields each chunk, including the final empty chunk in the case of
| chunked transfer-encoding, and may sleep between yields to simulate the
| delays between the original chunk receive times.
| """ |
| try: |
| # TODO(keybuk): this should probably record the time of the last call |
| # rather than just the delay |
| last_delay = 0.0 |
| for delay, chunk in self._chunks: |
| time.sleep(delay - last_delay) |
| last_delay = delay |
| yield chunk |
| except AttributeError: |
| self._chunks = [] |
| |
| # (keybuk) for non-chunked responses we just read the entire document as
| # one chunk; this means that when streaming there's a bit of a wait the
| # first time, since the whole body has to arrive from the server, but not
| # when playing back
| if not self.chunked: |
| yield self._RecordChunk(self.read(), last=True) |
| return |
| |
| # (keybuk) this code is basically _read_chunked() from the superclass, |
| # simplified to always read everything, and to yield the final empty |
| # chunk since we need to send it to the client ourselves |
| chunk_len = None |
| while chunk_len != 0: |
| line = self.fp.readline() |
| i = line.find(';') |
| if i >= 0: |
| line = line[:i] # strip chunk-extensions |
| try: |
| chunk_len = int(line, 16) |
| except ValueError: |
| # close the connection as protocol synchronisation is |
| # probably lost |
| self.close() |
| raise httplib.IncompleteRead(''.join(chunk for _, chunk in self._chunks))
| |
| # (keybuk) we always want to yield the final empty chunk since we |
| # need to send it to the client anyway |
| yield self._RecordChunk(self._safe_read(chunk_len), last=(chunk_len==0)) |
| self._safe_read(2) # toss the CRLF at the end of the chunk |
| |
| # we read everything; close the "file" |
| self.close() |
| |
| |
| class HTTPGzipCompressor(object):
| """Wrapper around a zlib compression object to handle HTTP gzip compression.
|
| We can't use the Python gzip module for this since it only encapsulates
| files, and we can't subclass the zlib compression object itself because
| that type is not exposed.
|
| This wrapper only implements the compress() and flush() methods, so it
| is equivalent to a Python 2.4-era zlib compression object.
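|
| Example (an illustrative sketch of the streaming interface):
|
|   compressor = HTTPGzipCompressor()
|   data = compressor.compress('Hello, ')
|   data += compressor.flush(zlib.Z_SYNC_FLUSH)  # emit what we have so far
|   data += compressor.compress('world')
|   data += compressor.flush()  # Z_FINISH appends the CRC and size trailer
|   assert zlib.decompress(data, 16 + zlib.MAX_WBITS) == 'Hello, world'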
| """ |
| |
| # see gzip.py _write_gzip_header() in the Python distribution |
| GZIP_HEADER = ( |
| '\037\213' # magic header |
| '\010' # compression method |
| '\000' # flags (none) |
| '\000\000\000\000' # packed time (use zero) |
| '\002' # extra flags
| '\377') # operating system (unknown)
| |
| def __init__(self): |
| # see gzip.py in the Python distribution |
| self._compressor = zlib.compressobj( |
| 6, zlib.DEFLATED, -zlib.MAX_WBITS, zlib.DEF_MEM_LEVEL, 0) |
| self._crc = zlib.crc32('') & 0xffffffffL |
| self._size = 0 |
| self._sent_header = False |
| |
| def compress(self, string):
| """Compress string, returning at least part of the compressed data.
|
| Wraps the zlib compression object's compress() method, prepending the
| gzip header before the first chunk and keeping track of the CRC and
| size of the uncompressed data for the gzip trailer.
|
| Args:
| string: string to be compressed.
| Returns:
| string of compressed data, which may represent only part of the input.
| """ |
| chunk = self._compressor.compress(string) |
| if not self._sent_header: |
| chunk = self.GZIP_HEADER + chunk |
| self._sent_header = True |
| |
| self._crc = zlib.crc32(string, self._crc) & 0xffffffffL |
| self._size += len(string) |
| return chunk |
| |
| def flush(self, mode=zlib.Z_FINISH):
| """Flush all pending input.
|
| Wraps the zlib compression object's flush() method; when called without
| arguments or with zlib.Z_FINISH, it appends the CRC and size of the
| uncompressed data as required by the gzip format.
|
| Args:
| mode: flush mode, see the zlib documentation.
| Returns:
| string of remaining compressed data.
| """ |
| chunk = self._compressor.flush(mode) |
| if mode == zlib.Z_FINISH: |
| chunk += struct.pack('<LL', long(self._crc), long(self._size)) |
| return chunk |
| |
| |
| class HTTPConnectionMixIn: # httplib.HTTPConnection is an old-style class, so no object base here
| response_class = HTTPResponse |
| |
| def getresponse(self): |
| """Get the response from the server. |
| |
| Wraps httplib.HTTPConnection.getresponse() to set the response's
| start_time attribute, used to calculate the delay between chunks being
| received, and ensures that status_delay is the time it took to receive
| the status line.
| """ |
| start_time = time.time() |
| response = httplib.HTTPConnection.getresponse(self) |
| response.start_time = start_time |
| # calculate here since we can't set start_time before getresponse |
| response.status_delay = response._status_receive_time - start_time |
| return response |
| |
| class HTTPConnection(HTTPConnectionMixIn, httplib.HTTPConnection): |
| pass |
| |
| class HTTPSConnection(HTTPConnectionMixIn, httplib.HTTPSConnection): |
| pass |
| |
| |
| class HTTPRequest(object): |
| """Recordable HTTP Request. |
| |
| Companion request object for HTTPResponse that is picklable and hashable |
| so it can be used as a key to find the appropriate HTTPResponse object; |
| implements smart matching since some headers change between otherwise
| identical requests, and the order of others changes.
|
| Because of this smart matching, one HTTPRequest object may in fact match
| several recorded ones. You should therefore use an indirect lookup where
| hashing an HTTPRequest yields a list of matching requests and their
| responses, and compare the request you're looking for against those
| using MatchScore().
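|
| Example (an illustrative sketch of that lookup; archive is a made-up
| dict, and recorded/incoming stand for HTTPRequest instances):
|
|   archive = {}
|   archive.setdefault(recorded, []).append((recorded, recorded_response))
|
|   candidates = archive[incoming]  # __hash__/__eq__ find loose matches
|   best_request, best_response = max(
|       candidates, key=lambda pair: incoming.MatchScore(pair[0]))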
| """ |
| |
| logger = logging.getLogger("HTTPRequest") |
| |
| unmatchable_headers = [ |
| 'Accept', |
| 'Accept-Charset', |
| 'Accept-Encoding', |
| 'Accept-Language', |
| 'Cache-Control', |
| 'Cookie', # special matching |
| 'Connection', |
| 'Keep-Alive', |
| 'Pragma', |
| 'Referer', # special matching |
| 'User-Agent', |
| ] |
| |
| def __init__(self, host, command, path, headers=None, body=None, ssl=False):
| self.host = host
| self.command = command
| self.path = path
| # avoid sharing a mutable default list between requests
| self.headers = headers if headers is not None else []
| self.body = body |
| self.ssl = ssl |
| |
| self._match_headers = self._MatchHeaders(self.headers) |
| |
| def __str__(self): |
| return "%s %s://%s%s" % (self.command, |
| self.ssl and 'https' or 'http', |
| self.host, self.path) |
| |
| def __repr__(self): |
| return "<%s: %s%r %s %r>" % (self.__class__.__name__, |
| self.ssl and '(SSL) ' or '', |
| self.host, self.command, self.path) |
| |
| def __hash__(self): |
| return hash(repr((self.host, self.command, self.path, self._match_headers, |
| self.body, self.ssl))) |
| |
| def __eq__(self, other): |
| if other.host != self.host: |
| return False |
| elif other.command != self.command: |
| return False |
| elif other.ssl != self.ssl: |
| return False |
| elif other.path != self.path: |
| return False |
| elif other.body != self.body: |
| return False |
| elif other._match_headers != self._match_headers: |
| return False |
| else: |
| return True |
| |
| def __getstate__(self): |
| state = self.__dict__.copy() |
| del state['_match_headers'] |
| return state |
| |
| def __setstate__(self, state): |
| self.__dict__.update(state) |
| self._match_headers = self._MatchHeaders(state['headers']) |
| |
| @classmethod |
| def _MatchHeaders(cls, headers): |
| match_headers = set() |
| for header, value in headers: |
| header = header.title() |
| if header in cls.unmatchable_headers: |
| continue |
| match_headers.add((header, value)) |
| return match_headers |
| |
| @property |
| def _cookies(self): |
| """Values of all cookies present in the request. |
| |
| Returns all values from all Cookie headers, including those that provide |
| multiple values. |
| """ |
| cookies = set() |
| for header, value in self.headers: |
| if header.title() == 'Cookie': |
| for cookie in value.split(';'): |
| cookies.add(cookie.strip()) |
| return cookies |
| |
| def getheader(self, name, default=None): |
| """Retrieve value of header.""" |
| for header, value in self.headers: |
| if header.title() == name.title(): |
| return value |
| else: |
| return default |
| |
| def MatchScore(self, other): |
| """Compare request with another and return a match score. |
| |
| The hash of an HTTPRequest object may match multiple other HTTPRequest |
| objects, in which case you should calculate the MatchScore of the request |
| you're seeking against those present and pick the highest one. |
| """ |
| # These numbers are randomly picked and seem to work ok for now |
| score = len(self._cookies.intersection(other._cookies)) * 10 |
| if self.getheader('referer') == other.getheader('referer'): |
| score += 1000 |
| return score |
| |
| |
| class HTTPClient(object): |
| """Generic HTTP Client. |
| |
| This class implements an HTTP Client that fetches the results using |
| the Python httplib library. HTTP Client objects are picklable. |
| |
| Example: |
| client = HTTPClient() |
| request = HTTPRequest('www.google.com', 'GET', '/') |
| response = client(request) |
| """ |
| |
| def __call__(self, request):
| """Look up the request.
| |
| Args: |
| request: HTTPRequest to lookup. |
| |
| Returns: |
| HTTPResponse reply, which may include an error response. |
| """ |
| if request.ssl: |
| connection = HTTPSConnection(request.host) |
| else: |
| connection = HTTPConnection(request.host) |
| |
| connection.putrequest(request.command, request.path, |
| skip_host=True, |
| skip_accept_encoding=True) |
| |
| send_host = True |
| send_accept_encoding = True |
| |
| for header, value in request.headers: |
| if header.title() == 'Host': |
| send_host = False |
| elif header.title() == 'Accept-Encoding': |
| send_accept_encoding = False |
| |
| connection.putheader(header, value) |
| |
| if send_host: |
| connection.putheader('Host', request.host) |
| if send_accept_encoding: |
| connection.putheader('Accept-Encoding', '') |
| |
| connection.endheaders() |
| |
| if request.body is not None: |
| connection.send(request.body) |
| |
| return connection.getresponse() |
| |
| |
| class HTTPMiddleware(object): |
| """Base class for HTTP Client middleware. |
| |
| This class is a base class for HTTPClient-compatible classes that
| accept an HTTPClient argument to their constructor and surround it
| with their own processing.
| |
| HTTP Middleware objects are picklable. |
| |
| When creating your subclass, if it will work without an HTTP Client
| you can set the client_optional class member to True; an instance may
| then raise KeyError when it cannot proceed further without one.
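|
| Example (an illustrative sketch of a subclass, not part of this module):
|
|   class LoggingHTTPClient(HTTPMiddleware):
|     def __call__(self, request):
|       response = self.http_client(request)
|       logging.info("%s -> %d", request, response.status)
|       return response
|
|   client = LoggingHTTPClient(HTTPClient())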
| """ |
| client_optional = False |
| |
| def __init__(self, http_client=HTTPClient()): |
| """Create the client middleware instance. |
| |
| Args: |
| http_client: HTTP Client object to wrap, defaults to plain HTTPClient() |
| and may be None if client_optional is True for the class. |
| """ |
| self.http_client = http_client |
| |
| @property |
| def http_client(self): |
| return self._http_client |
| |
| @http_client.setter |
| def http_client(self, http_client): |
| assert http_client is not None or self.client_optional |
| self._http_client = http_client |
| |
| @http_client.deleter |
| def http_client(self): |
| assert self.client_optional |
| self._http_client = None |
| |
| def __call__(self, request):
| """Look up the request.
|
| Subclasses should override this method with their own; there is no
| need to call the superclass version.
| |
| Args: |
| request: HTTPRequest to lookup. |
| |
| Returns: |
| HTTPResponse reply, which may include an error response. |
| """ |
| if not self._http_client: |
| raise KeyError |
| |
| response = self._http_client(request) |
| return response |
| |
| |
| class ArchivingHTTPClient(HTTPMiddleware): |
| """Archiving HTTP Client middleware. |
| |
| This HTTP Client middleware wraps an HTTP Client and archives all of its |
| responses. If a request is found in the archive, that is returned in |
| place of using the HTTP Client given to the constructor. |
| |
| The archive may be initialised with None as the HTTP Client, in which
| case KeyError will be raised if a request is not found in the archive.
| |
| Matching is done using HTTPRequest smart matching. |
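|
| Example (an illustrative sketch of recording and playback):
|
|   client = ArchivingHTTPClient(HTTPClient())
|   request = HTTPRequest('www.google.com', 'GET', '/')
|   client(request)  # MISS: fetched with the wrapped client and archived
|   client(request)  # HIT: served from the archive
|
|   replay_only = ArchivingHTTPClient(None)  # raises KeyError on misses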
| """ |
| client_optional = True |
| |
| logger = logging.getLogger("ArchivingHTTPClient") |
| |
| def __init__(self, http_client=HTTPClient()): |
| super(ArchivingHTTPClient, self).__init__(http_client) |
| |
| self._responses = {} |
| self._lock = threading.Lock() |
| |
| def __getstate__(self): |
| state = self.__dict__.copy() |
| del state['_lock'] |
| return state |
| |
| def __setstate__(self, state): |
| self.__dict__.update(state) |
| |
| self._lock = threading.Lock() |
| |
| def __call__(self, request):
| """Look up the request.
|
| Returns an archived response if one exists; otherwise uses the next
| HTTP Client in the middleware stack to resolve it, raising KeyError
| if no further client exists.
| |
| Args: |
| request: HTTPRequest to lookup. |
| |
| Returns: |
| HTTPResponse reply, which may include an error response. |
| """ |
| try: |
| with self._lock: |
| best_response, best_score = None, 0 |
| for original_request, response in self._responses[request]: |
| score = request.MatchScore(original_request) |
| if score >= best_score: |
| best_response, best_score = response, score |
| |
| self.logger.info("HIT %s", request) |
| return best_response |
| except KeyError: |
| self.logger.info("MISS %s", request) |
| if not self._http_client: |
| raise |
| |
| response = self._http_client(request) |
| with self._lock: |
| try: |
| self._responses[request].append((request, response)) |
| except KeyError: |
| self._responses[request] = [ (request, response) ] |
| return response |