httputil.py
#
# Generic utilities for HTTP service.
# These are not part of the core DWiki HTML view code but are still
# HTTP-connector-independent, so we can reuse them between all of
# the connector methods.
#
import urllib, urlparse
import time
import htmlresp
#
# Error processing for situations that are outside of the pages that
# DWiki serves. Because these are not supposed to happen in normal
# operation, we are very short and curt. (We cannot use htmlerr
# because that is for errors inside a DWiki context.)
#
genericErr = """<html><head><title>%d - Request Unsuccessful</title></head>
<body><h1> %d: Request Unsuccessful </h1>
<p> Your request cannot be satisfied. </p>
<p> %s </p>
</body></html>
"""
robotmsg = """You appear to be a web spider or web robot making a request
for something that web robots are not supposed to visit. In particular,
please stop crawling through links marked with <tt>rel="nofollow"</tt>. </p>
<p> If you are not a web robot, we apologize for the problem; please
make sure that your browser environment sends a valid User-Agent header
and try again."""
errorMsgs = {
    "out-of-zone": "Your request is for a URL we do not serve.",
    "file-not-available": "The page you requested is not available.",
    "internal-error": "The server encountered an internal error while processing your request.",
    "sec-error": "You appear to be trying to break this web server. Goodbye.",
    "not-supported": "This server does not support that operation.",
    "web-robot": robotmsg,
    "disallowed": "Access not allowed",
}
def genError(what, ecode = 404):
    resp = htmlresp.Response()
    resp.error(genericErr % (ecode, ecode, errorMsgs[what]))
    resp.code = ecode
    return resp
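# A typical use from connector code might look like this (both keys are
# defined in errorMsgs above):
#
#   resp = genError("file-not-available")      # defaults to a 404
#   resp = genError("not-supported", 501)      # explicit status code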
# 'what' is the raw path that is incomplete (lacks a final slash).
# We add the slash and return a redirect to the slashed version.
def redirToSlashedDir(what, reqdata):
    resp = htmlresp.Response()
    what = what + '/'
    resp.redirect("%s%s" % (reqdata['server-url'], what))
    return resp
# If path is a child of directory root, return the relative portion of
# the path; '' means the root. Otherwise, return None.
def getRelativePath(root, path):
    root = root.rstrip('/')
    rroot = root + '/'
    # Request for the root:
    if path == root or path == rroot:
        return ''
    # okay, something under the root?
    if path.startswith(rroot):
        return path[len(rroot):]
    # Does not.
    return None
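# For example:
#
#   >>> getRelativePath('/wiki/', '/wiki/Front')
#   'Front'
#   >>> getRelativePath('/wiki', '/wiki')
#   ''
#   >>> getRelativePath('/wiki', '/blog/Front') is None
#   True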
# Because I don't care to be too paranoid, this just picks the last
# one out of a bunch of the same things.
# All hail CJ Silverio, who wrote the code that I stole this from.
def parseQueryStringView(qstr):
    res = {}
    view = None
    for p in qstr.split("&"):
        kv = p.split("=", 1)
        if len(kv) != 2:
            view = p
        else:
            key, value = kv
            value = urllib.unquote_plus(value)
            res[key] = value.replace("\r\n", "\n")
    return (view, res)
# We actively don't want a view for this one.
def parseQueryString(qstr):
    _, res = parseQueryStringView(qstr)
    return res
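# For example, a bare component with no '=' becomes the view and values
# are dequoted:
#
#   >>> parseQueryStringView('atom&page=Front%20Page')
#   ('atom', {'page': 'Front Page'})
#   >>> parseQueryString('page=Front%20Page')
#   {'page': 'Front Page'}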
#
# Apart from the bit where we need to know the original headers of the
# request (or at least the two we really care about), the conditional
# logic of whether we can serve a 304 is standard.
ifModSince = 'HTTP_IF_MODIFIED_SINCE'
ifNoneMatch = 'HTTP_IF_NONE_MATCH'
# Can we send a 304?
# This depends on either or both of last-modified and etags being
# present. The logic is convoluted: if both are present, both must
# match. Otherwise, the present one must match; if neither is
# there, we can't match.
# This logic is generic ... except the bit where we have to be
# dealing with the request headers.
def ifNotModified(environ, resp):
    if resp.code != 200:
        return False
    rLM = environ.get(ifModSince, None)
    rET = environ.get(ifNoneMatch, None)
    aLM = resp.headers.get('Last-Modified', None)
    aET = resp.headers.get('ETag', None)
    # Very special hack. Tiny Tiny RSS is extremely popular and sends
    # an 'If-Modified-Since' of *when it last fetched the feed*, not the
    # feed timestamp itself. And no ETag (not that that would help).
    # I hate it but I hate the bandwidth waste etc even more.
    if resp.time_reliable and rLM and not rET and \
       environ.get("HTTP_USER_AGENT", "").startswith("Tiny Tiny RSS/"):
        try:
            rts = time.strptime(rLM, '%a, %d %b %Y %H:%M:%S %Z')
        except ValueError:
            return False
        rtime = time.mktime(rts)
        if rtime > resp.lastmodified:
            return True
    # Anything present must match; otherwise, reject.
    if (rLM and rLM != aLM) or (rET and rET != aET):
        return False
    # If we have matchers, they matched and we're good, assuming
    # that we can match time at all.
    if (rLM and aLM and resp.time_reliable) or (rET and aET):
        return True
    # Otherwise, reject.
    return False
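# A rough connector-side sketch of how this is meant to be called;
# make304Response() is hypothetical, standing in for whatever the
# htmlresp API actually offers:
#
#   if ifNotModified(environ, resp):
#       resp = make304Response(resp)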
#
# Split a URL to get the query components. We can't use
# urlparse.urlparse() because it will mis-handle double
# slashes. Plus, *no dequoting?!*
def urlsplit(url):
    # We are of the opinion that people who quote the '?' get to
    # lose.
    if '?' not in url:
        return (urllib.unquote(url), '')
    else:
        r = url.split('?', 1)
        return (urllib.unquote(r[0]), r[1])
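# For example, the path side is dequoted but the query side is left
# alone:
#
#   >>> urlsplit('/wiki/Front%20Page?view=atom')
#   ('/wiki/Front Page', 'view=atom')
#   >>> urlsplit('/wiki/')
#   ('/wiki/', '')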
# You might think that this was simple. You would be ... how shall
# we say ... slightly off. We assume that REQUEST_URI is present.
# The core problem is that SCRIPT_NAME is actually literally the
# name of the script, but it is not necessarily the front of the
# *URL*, after which you can find PATH_INFO.
# (In a WSGI environment, we may be being too picky. Tough.)
def urlFromEnv(env):
    rq = env.get('REQUEST_URI', '')
    #qi = env.get('QUERY_STRING', '')
    if not rq:
        # Well, I guess we fake it.
        base = env.get('SCRIPT_NAME', '') + env.get('PATH_INFO', '')
    else:
        # We must do a full unparse, because some stupid joker may
        # have fed us a URL with a fragment identifier.
        # I'm not sure what we should do in this case; generate an
        # internal URL that is going to be invalid?
        # (PS: I don't know what I was talking about re urlparse
        # and double slashes up above, back when I wrote it.)
        schema, host, base, param, query, frag = urlparse.urlparse(rq)
    return urllib.unquote(base)
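# For example, the query string and any fragment are discarded and the
# path is dequoted:
#
#   >>> urlFromEnv({'REQUEST_URI': '/wiki/Front%20Page?view=atom#top'})
#   '/wiki/Front Page'
#   >>> urlFromEnv({'SCRIPT_NAME': '/wiki', 'PATH_INFO': '/Front'})
#   '/wiki/Front'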
# Extract the host from the REQUEST_URI, if it is present.
def hostFromEnv(env):
    rq = env.get('REQUEST_URI', '')
    if not rq:
        return None
    schema, host, base, param, query, frag = urlparse.urlparse(rq)
    return host
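# Note that this only yields a real host when REQUEST_URI is an absolute
# URL; a bare path gives us the empty string:
#
#   >>> hostFromEnv({'REQUEST_URI': 'http://example.org/wiki/'})
#   'example.org'
#   >>> hostFromEnv({'REQUEST_URI': '/wiki/'})
#   ''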
# This is lame, partly because I have no real idea what format of IPv6
# addresses web servers in the real world write into $REMOTE_ADDR.
# Checking for '[' at the start is probably conservative.
def is_ipv6_addr(sip):
    return ':' in sip or sip.startswith('[')
#
# Match an IP address -- usually the request source -- against a (string)
# list of IPs or tcpwrappers style IP prefixes. Returns True or False,
# depending on whether or not things matched.
# CHANGED: ipLst must actually be a netblock.IPRanges object; we match
# with 'in'.
def matchIP(sip, ipLst):
    # Check for IPv6 first, because we don't work on IPv6 addresses.
    # Maybe someday.
    if is_ipv6_addr(sip):
        return False
    if not ipLst:
        return False
    return sip in ipLst
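# A rough usage sketch; 'banned' must be a netblock.IPRanges object
# (its construction is netblock's business and is elided here):
#
#   banned = ...    # some netblock.IPRanges instance
#   if matchIP(environ.get('REMOTE_ADDR', ''), banned):
#       return genError("disallowed", 403)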
# We should do better.
# See http://www.peterbe.com/plog/html-entity-fixer and
# the snippy equivalent in trunk/libraries/snippy/utilities.py
# Unfortunately doing it right requires charset knowledge.
#
# Quoting & < > and " hits all of the printable characters that
# have special encodings. So we'll call it a day (for now).
# http://www.tbray.org/ongoing/When/200x/2004/01/11/PostelPilgrim
# lists "'" as a fifth character I need to quote. Goddamnit, as
# they say.
#import re
#mustquote = re.compile(r'[&<>"]')
#quoteEntities = { '&': '&amp;', '<': '&lt;', '>': '&gt;', '"': '&quot;',
#                  "'": '&#39;', }
#def quotehtml(hstr):
#    def _quote(what):
#        return quoteEntities[what.group(0)]
#    return mustquote.sub(_quote, hstr)
# This is the faster way, used by among others the standard library
# XML code.
# ('I' in this is Daniel Martin, who helped Chris Siebenmann improve this.)
# I use &#39; instead of &apos; because I've found support for &apos;
# in the past to be a bit spotty. Specifically, &apos; is part of the
# XML 1.0 spec, but IS NOT part of the HTML4 spec. (see the file
# HTMLspecial.ent linked from http://www.w3.org/TR/html4/sgml/intro.html)
# Fortunately, both targets respect &#39; as meaning "'".
# NOTE: & MUST COME FIRST.
quoteEntities = (('&', '&amp;'), ('<', '&lt;'), ('>', '&gt;'),
                 ('"', '&quot;'), ("'", '&#39;'))
def quotehtml(hstr):
    for qe, qs in quoteEntities:
        hstr = hstr.replace(qe, qs)
    return hstr
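# For example:
#
#   >>> quotehtml("<b> & 'c'")
#   '&lt;b&gt; &amp; &#39;c&#39;'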
# This is not quite equal to what wikirend does. Different contexts.
uquoteEntities = (('&', '&amp;'), ('"', '%22'), (' ', '%20'), ('>', '%3E'))
def quoteurl(ustr):
    for qe, qs in uquoteEntities:
        ustr = ustr.replace(qe, qs)
    return ustr
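# For example, the result is safe to drop into an HTML attribute:
#
#   >>> quoteurl('/page?a=1&b=hi there')
#   '/page?a=1&amp;b=hi%20there'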