path: root/toys/brhute-py/contrib/httplib_pipelining_orig.py
## {{{ http://code.activestate.com/recipes/576673/ (r5)
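#
# The trick: httplib refuses to send a new request while a response is
# still pending, so the recipe forces the connection's private state back
# to _CS_IDLE, writes several GETs on one socket, and then reads the
# queued responses in request order (HTTP/1.1 pipelining).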
from httplib import HTTPConnection, _CS_IDLE
import urlparse

def pipeline(domain,pages,max_out_bound=4,debuglevel=0):
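    """Fetch each of `pages` from `domain` over one pipelined connection.

    Keeps at most max_out_bound requests in flight. Returns a list parallel
    to pages: the response body on success, (status, reason) on error, or
    (status, reason, location) for a redirect to another host.
    """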
    pages = list(pages) # copy: entries are rewritten in place on same-host redirects
    pagecount = len(pages)
    conn = HTTPConnection(domain)
    conn.set_debuglevel(debuglevel)
    respobjs = [None]*pagecount
    finished = [False]*pagecount
    data = [None]*pagecount
    headers = {'Host':domain,'Content-Length':0,'Connection':'Keep-Alive'}

    while not all(finished):
        # Send: issue new requests until max_out_bound are outstanding
        out_bound = sum(1 for r in respobjs if r is not None)
        for i,page in enumerate(pages):
            if out_bound >= max_out_bound:
                break
            elif page and not finished[i] and respobjs[i] is None:
                if debuglevel > 0:
                    print 'Sending request for %r...' % (page,)
                conn._HTTPConnection__state = _CS_IDLE # force httplib's private state back to idle so it accepts another request
                conn.request("GET", page, None, headers)
                respobjs[i] = conn.response_class(conn.sock, strict=conn.strict, method=conn._method)
                out_bound += 1
        # Try to read a response
        for i,resp in enumerate(respobjs):
            if resp is None:
                continue
            if debuglevel > 0:
                print 'Retrieving %r...' % (pages[i],)
            skip_read = False
            resp.begin()
            if debuglevel > 0:
                print '    %d %s' % (resp.status, resp.reason)
            if 200 <= resp.status < 300:
                # Ok
                data[i] = resp.read()
                cookie = resp.getheader('Set-Cookie')
                if cookie is not None:
                    headers['Cookie'] = cookie
                skip_read = True
                finished[i] = True
                respobjs[i] = None
            elif 300 <= resp.status < 400:
                # Redirect
                loc = resp.getheader('Location')
                respobjs[i] = None
                parsed = loc and urlparse.urlparse(loc)
                if not parsed:
                    # Missing or empty location header
                    data[i] = (resp.status, resp.reason)
                    finished[i] = True
                elif parsed.netloc != '' and parsed.netloc != domain:
                    # Redirect to another host
                    data[i] = (resp.status, resp.reason, loc)
                    finished[i] = True
                else:
                    path = urlparse.urlunparse(parsed._replace(scheme='',netloc='',fragment=''))
                    if debuglevel > 0:
                        print '  Updated %r to %r' % (pages[i],path)
                    pages[i] = path
            elif resp.status >= 400:
                # Failed
                data[i] = (resp.status, resp.reason)
                finished[i] = True
                respobjs[i] = None
            if resp.will_close:
                # Connection (will be) closed, need to resend
                conn.close()
                if debuglevel > 0:
                    print '  Connection closed'
                for j,f in enumerate(finished):
                    if not f and respobjs[j] is not None:
                        if debuglevel > 0:
                            print '  Discarding out-bound request for %r' % (pages[j],)
                        respobjs[j] = None
                break
            elif not skip_read:
                resp.read() # read any data
            if any(not f and respobjs[j] is None for j,f in enumerate(finished)):
                # Send another pending request
                break
        else:
            break # nothing outstanding and nothing left to send
    return data

if __name__ == '__main__':
    domain = 'en.wikipedia.org'
    pages = ('/wiki/HTTP_pipelining', '/wiki/HTTP', '/wiki/HTTP_persistent_connection')
    data = pipeline(domain,pages,max_out_bound=2,debuglevel=1)
    for i,page in enumerate(data):
        print
        print '==== Page %r ====' % (pages[i],)
        print page[:512]
## end of http://code.activestate.com/recipes/576673/ }}}
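
## {{{ wire-level sketch of the same idea (not part of the recipe above)
# What the state-machine hack achieves on the wire: several GET requests
# written back-to-back on one TCP connection before any response is read.
# A minimal sketch assuming an HTTP/1.1 server on port 80; pipeline_raw is
# a hypothetical helper, and real code would have to parse
# Content-Length/chunked framing to split the concatenated responses.
import socket

def pipeline_raw(domain, pages):
    sock = socket.create_connection((domain, 80))
    for i, page in enumerate(pages):
        # ask the server to close after the last response so recv() sees EOF
        conn_hdr = 'close' if i == len(pages) - 1 else 'keep-alive'
        sock.sendall('GET %s HTTP/1.1\r\nHost: %s\r\nConnection: %s\r\n\r\n'
                     % (page, domain, conn_hdr))
    chunks = []
    while True:
        buf = sock.recv(4096)
        if not buf:
            break
        chunks.append(buf)
    sock.close()
    return ''.join(chunks)  # concatenated raw responses, in request order
## }}}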