google_appengine/google/appengine/api/urlfetch.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361

#!/usr/bin/env python
#
# Copyright 2007 Google Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

"""URL downloading API.

Methods defined in this module:
   Fetch(): fetchs a given URL using an HTTP GET or POST
"""


import os
import UserDict
import urllib2
import urlparse

from google.appengine.api import apiproxy_stub_map
from google.appengine.api import urlfetch_service_pb
from google.appengine.api.urlfetch_errors import *
from google.appengine.runtime import apiproxy_errors

MAX_REDIRECTS = 5

GET = 1
POST = 2
HEAD = 3
PUT = 4
DELETE = 5


_URL_STRING_MAP = {
    'GET': GET,
    'POST': POST,
    'HEAD': HEAD,
    'PUT': PUT,
    'DELETE': DELETE,
}


_VALID_METHODS = frozenset(_URL_STRING_MAP.values())


class _CaselessDict(UserDict.IterableUserDict):
  """Case insensitive dictionary.

  This class was lifted from os.py and slightly modified.
  """

  def __init__(self):
    UserDict.IterableUserDict.__init__(self)
    self.caseless_keys = {}

  def __setitem__(self, key, item):
    """Set dictionary item.

    Args:
      key: Key of new item.  Key is case insensitive, so "d['Key'] = value "
        will replace previous values set by "d['key'] = old_value".
      item: Item to store.
    """
    caseless_key = key.lower()
    if caseless_key in self.caseless_keys:
      del self.data[self.caseless_keys[caseless_key]]
    self.caseless_keys[caseless_key] = key
    self.data[key] = item

  def __getitem__(self, key):
    """Get dictionary item.

    Args:
      key: Key of item to get.  Key is case insensitive, so "d['Key']" is the
        same as "d['key']".

    Returns:
      Item associated with key.
    """
    return self.data[self.caseless_keys[key.lower()]]

  def __delitem__(self, key):
    """Remove item from dictionary.

    Args:
      key: Key of item to remove.  Key is case insensitive, so "del d['Key']" is
        the same as "del d['key']"
    """
    caseless_key = key.lower()
    del self.data[self.caseless_keys[caseless_key]]
    del self.caseless_keys[caseless_key]

  def has_key(self, key):
    """Determine if dictionary has item with specific key.

    Args:
      key: Key to check for presence.  Key is case insensitive, so
        "d.has_key('Key')" evaluates to the same value as "d.has_key('key')".

    Returns:
      True if dictionary contains key, else False.
    """
    return key.lower() in self.caseless_keys

  def __contains__(self, key):
    """Same as 'has_key', but used for 'in' operator.'"""
    return self.has_key(key)

  def get(self, key, failobj=None):
    """Get dictionary item, defaulting to another value if it does not exist.

    Args:
      key: Key of item to get.  Key is case insensitive, so "d['Key']" is the
        same as "d['key']".
      failobj: Value to return if key not in dictionary.
    """
    try:
      cased_key = self.caseless_keys[key.lower()]
    except KeyError:
      return failobj
    return self.data[cased_key]

  def update(self, dict=None, **kwargs):
    """Update dictionary using values from another dictionary and keywords.

    Args:
      dict: Dictionary to update from.
      kwargs: Keyword arguments to update from.
    """
    if dict:
      try:
        keys = dict.keys()
      except AttributeError:
        for k, v in dict:
          self[k] = v
      else:
        for k in keys:
          self[k] = dict[k]
    if kwargs:
      self.update(kwargs)

  def copy(self):
    """Make a shallow, case sensitive copy of self."""
    return dict(self)


def _is_fetching_self(url, method):
  """Checks if the fetch is for the same URL from which it originated.

  Args:
    url: str, The URL being fetched.
    method: value from _VALID_METHODS.

  Returns:
    boolean indicating whether or not it seems that the app is trying to fetch
      itself.
  """
  if (method != GET or
      "HTTP_HOST" not in os.environ or
      "PATH_INFO" not in os.environ):
    return False

  scheme, host_port, path, query, fragment = urlparse.urlsplit(url)

  if host_port == os.environ['HTTP_HOST']:
    current_path = urllib2.unquote(os.environ['PATH_INFO'])
    desired_path = urllib2.unquote(path)

    if (current_path == desired_path or
        (current_path in ('', '/') and desired_path in ('', '/'))):
      return True

  return False


def create_rpc(deadline=None, callback=None):
  """Creates an RPC object for use with the urlfetch API.

  Args:
    deadline: Optional deadline in seconds for the operation; the default
      is a system-specific deadline (typically 5 seconds).
    callback: Optional callable to invoke on completion.

  Returns:
    An apiproxy_stub_map.UserRPC object specialized for this service.
  """
  return apiproxy_stub_map.UserRPC('urlfetch', deadline, callback)


def fetch(url, payload=None, method=GET, headers={},
          allow_truncated=False, follow_redirects=True,
          deadline=None):
  """Fetches the given HTTP URL, blocking until the result is returned.

  Other optional parameters are:
     method: GET, POST, HEAD, PUT, or DELETE
     payload: POST or PUT payload (implies method is not GET, HEAD, or DELETE).
       this is ignored if the method is not POST or PUT.
     headers: dictionary of HTTP headers to send with the request
     allow_truncated: if true, truncate large responses and return them without
       error. Otherwise, ResponseTooLargeError is raised when a response is
       truncated.
     follow_redirects: if true (the default), redirects are
       transparently followed and the response (if less than 5
       redirects) contains the final destination's payload and the
       response status is 200.  You lose, however, the redirect chain
       information.  If false, you see the HTTP response yourself,
       including the 'Location' header, and redirects are not
       followed.
     deadline: deadline in seconds for the operation.

  We use a HTTP/1.1 compliant proxy to fetch the result.

  The returned data structure has the following fields:
     content: string containing the response from the server
     status_code: HTTP status code returned by the server
     headers: dictionary of headers returned by the server

  If the URL is an empty string or obviously invalid, we throw an
  urlfetch.InvalidURLError. If the server cannot be contacted, we throw a
  urlfetch.DownloadError.  Note that HTTP errors are returned as a part
  of the returned structure, so HTTP errors like 404 do not result in an
  exception.
  """
  rpc = create_rpc(deadline=deadline)
  make_fetch_call(rpc, url, payload, method, headers,
                  allow_truncated, follow_redirects)
  return rpc.get_result()


def make_fetch_call(rpc, url, payload=None, method=GET, headers={},
                    allow_truncated=False, follow_redirects=True):
  """Executes the RPC call to fetch a given HTTP URL.

  The first argument is a UserRPC instance.  See urlfetch.fetch for a
  thorough description of remaining arguments.
  """
  assert rpc.service == 'urlfetch', repr(rpc.service)
  if isinstance(method, basestring):
    method = method.upper()
  method = _URL_STRING_MAP.get(method, method)
  if method not in _VALID_METHODS:
    raise InvalidMethodError('Invalid method %s.' % str(method))

  if _is_fetching_self(url, method):
    raise InvalidURLError("App cannot fetch the same URL as the one used for "
                          "the request.")

  request = urlfetch_service_pb.URLFetchRequest()
  response = urlfetch_service_pb.URLFetchResponse()
  request.set_url(url)

  if method == GET:
    request.set_method(urlfetch_service_pb.URLFetchRequest.GET)
  elif method == POST:
    request.set_method(urlfetch_service_pb.URLFetchRequest.POST)
  elif method == HEAD:
    request.set_method(urlfetch_service_pb.URLFetchRequest.HEAD)
  elif method == PUT:
    request.set_method(urlfetch_service_pb.URLFetchRequest.PUT)
  elif method == DELETE:
    request.set_method(urlfetch_service_pb.URLFetchRequest.DELETE)

  if payload and (method == POST or method == PUT):
    request.set_payload(payload)

  for key, value in headers.iteritems():
    header_proto = request.add_header()
    header_proto.set_key(key)
    header_proto.set_value(str(value))

  request.set_followredirects(follow_redirects)

  if rpc.deadline is not None:
    request.set_deadline(rpc.deadline)

  rpc.make_call('Fetch', request, response, _get_fetch_result, allow_truncated)


def _get_fetch_result(rpc):
  """Check success, handle exceptions, and return converted RPC result.

  This method waits for the RPC if it has not yet finished, and calls the
  post-call hooks on the first invocation.

  Args:
    rpc: A UserRPC object.

  Raises:
    InvalidURLError if the url was invalid.
    DownloadError if there was a problem fetching the url.
    ResponseTooLargeError if the response was either truncated (and
      allow_truncated=False was passed to make_fetch_call()), or if it
      was too big for us to download.

  Returns:
    A _URLFetchResult object.
  """
  assert rpc.service == 'urlfetch', repr(rpc.service)
  assert rpc.method == 'Fetch', repr(rpc.method)
  try:
    rpc.check_success()
  except apiproxy_errors.ApplicationError, err:
    if (err.application_error ==
        urlfetch_service_pb.URLFetchServiceError.INVALID_URL):
      raise InvalidURLError(str(err))
    if (err.application_error ==
        urlfetch_service_pb.URLFetchServiceError.UNSPECIFIED_ERROR):
      raise DownloadError(str(err))
    if (err.application_error ==
        urlfetch_service_pb.URLFetchServiceError.FETCH_ERROR):
      raise DownloadError(str(err))
    if (err.application_error ==
        urlfetch_service_pb.URLFetchServiceError.RESPONSE_TOO_LARGE):
      raise ResponseTooLargeError(None)
    if (err.application_error ==
        urlfetch_service_pb.URLFetchServiceError.DEADLINE_EXCEEDED):
      raise DownloadError(str(err))
    raise err

  response = rpc.response
  allow_truncated = rpc.user_data
  result = _URLFetchResult(response)
  if response.contentwastruncated() and not allow_truncated:
    raise ResponseTooLargeError(result)
  return result


Fetch = fetch


class _URLFetchResult(object):
  """A Pythonic representation of our fetch response protocol buffer.
  """

  def __init__(self, response_proto):
    """Constructor.

    Args:
      response_proto: the URLFetchResponse proto buffer to wrap.
    """
    self.__pb = response_proto
    self.content = response_proto.content()
    self.status_code = response_proto.statuscode()
    self.content_was_truncated = response_proto.contentwastruncated()
    self.headers = _CaselessDict()
    for header_proto in response_proto.header_list():
      self.headers[header_proto.key()] = header_proto.value()