#!/usr/bin/env python
__author__ = "alimiracle"

import datetime
import hashlib
import logging
import pickle
import re
import time
import urllib
import wsgiref.handlers

from google.appengine.api import memcache
from google.appengine.api import urlfetch
from google.appengine.ext import db
from google.appengine.ext import webapp
from google.appengine.ext.webapp import template
from google.appengine.runtime import apiproxy_errors

import transform_content
  18. ################################################################################
  19. DEBUG = False
  20. EXPIRATION_DELTA_SECONDS = 3600
  21. EXPIRATION_RECENT_URLS_SECONDS = 90
  22. ## DEBUG = True
  23. ## EXPIRATION_DELTA_SECONDS = 10
  24. ## EXPIRATION_RECENT_URLS_SECONDS = 1
  25. HTTP_PREFIX = "http://"
  26. HTTPS_PREFIX = "http://"
  27. IGNORE_HEADERS = frozenset([
  28. 'set-cookie',
  29. 'expires',
  30. 'cache-control',
  31. # Ignore hop-by-hop headers
  32. 'connection',
  33. 'keep-alive',
  34. 'proxy-authenticate',
  35. 'proxy-authorization',
  36. 'te',
  37. 'trailers',
  38. 'transfer-encoding',
  39. 'upgrade',
  40. ])
  41. TRANSFORMED_CONTENT_TYPES = frozenset([
  42. "text/html",
  43. "text/css",
  44. ])
  45. MIRROR_HOSTS = frozenset([
  46. 'mirrorr.com',
  47. 'mirrorrr.com',
  48. 'www.mirrorr.com',
  49. 'www.mirrorrr.com',
  50. 'www1.mirrorrr.com',
  51. 'www2.mirrorrr.com',
  52. 'www3.mirrorrr.com',
  53. ])
  54. MAX_CONTENT_SIZE = 10 ** 6
  55. MAX_URL_DISPLAY_LENGTH = 50
  56. ################################################################################
  57. def get_url_key_name(url):
  58. url_hash = hashlib.sha256()
  59. url_hash.update(url)
  60. return "hash_" + url_hash.hexdigest()
  61. ################################################################################
  62. class EntryPoint(db.Model):
  63. translated_address = db.TextProperty(required=True)
  64. last_updated = db.DateTimeProperty(auto_now=True)
  65. display_address = db.TextProperty()
  66. class MirroredContent(object):
  67. def __init__(self, original_address, translated_address,
  68. status, headers, data, base_url):
  69. self.original_address = original_address
  70. self.translated_address = translated_address
  71. self.status = status
  72. self.headers = headers
  73. self.data = data
  74. self.base_url = base_url
  75. @staticmethod
  76. def get_by_key_name(key_name):
  77. return memcache.get(key_name)
  78. @staticmethod
  79. def fetch_and_store(key_name, base_url, translated_address, mirrored_url):
  80. """Fetch and cache a page.
  81. Args:
  82. key_name: Hash to use to store the cached page.
  83. base_url: The hostname of the page that's being mirrored.
  84. translated_address: The URL of the mirrored page on this site.
  85. mirrored_url: The URL of the original page. Hostname should match
  86. the base_url.
  87. Returns:
  88. A new MirroredContent object, if the page was successfully retrieved.
  89. None if any errors occurred or the content could not be retrieved.
  90. """
  91. # Check for the X-Mirrorrr header to ignore potential loops.
  92. if base_url in MIRROR_HOSTS:
  93. logging.warning('Encountered recursive request for "%s"; ignoring',
  94. mirrored_url)
  95. return None
  96. logging.debug("Fetching '%s'", mirrored_url)
  97. try:
  98. response = urlfetch.fetch(mirrored_url)
  99. except (urlfetch.Error, apiproxy_errors.Error):
  100. logging.exception("Could not fetch URL")
  101. return None
  102. adjusted_headers = {}
  103. for key, value in response.headers.iteritems():
  104. adjusted_key = key.lower()
  105. if adjusted_key not in IGNORE_HEADERS:
  106. adjusted_headers[adjusted_key] = value
  107. content = response.content
  108. page_content_type = adjusted_headers.get("content-type", "")
  109. for content_type in TRANSFORMED_CONTENT_TYPES:
  110. # Startswith() because there could be a 'charset=UTF-8' in the header.
  111. if page_content_type.startswith(content_type):
  112. content = transform_content.TransformContent(base_url, mirrored_url,
  113. content)
  114. break
  115. # If the transformed content is over 1MB, truncate it (yikes!)
  116. if len(content) > MAX_CONTENT_SIZE:
  117. logging.warning('Content is over 1MB; truncating')
  118. content = content[:MAX_CONTENT_SIZE]
  119. new_content = MirroredContent(
  120. base_url=base_url,
  121. original_address=mirrored_url,
  122. translated_address=translated_address,
  123. status=response.status_code,
  124. headers=adjusted_headers,
  125. data=content)
  126. if not memcache.add(key_name, new_content, time=EXPIRATION_DELTA_SECONDS):
  127. logging.error('memcache.add failed: key_name = "%s", '
  128. 'original_url = "%s"', key_name, mirrored_url)
  129. return new_content
  130. ################################################################################
  131. class BaseHandler(webapp.RequestHandler):
  132. def get_relative_url(self):
  133. slash = self.request.url.find("/", len(self.request.scheme + "://"))
  134. if slash == -1:
  135. return "/"
  136. return self.request.url[slash:]
  137. class HomeHandler(BaseHandler):
  138. def get(self):
  139. # Handle the input form to redirect the user to a relative url
  140. form_url = self.request.get("url")
  141. if form_url:
  142. # Accept URLs that still have a leading 'http://'
  143. inputted_url = urllib.unquote(form_url)
  144. if inputted_url.startswith(HTTP_PREFIX):
  145. inputted_url = inputted_url[len(HTTP_PREFIX):]
  146. return self.redirect("/" + inputted_url)
  147. latest_urls = memcache.get('latest_urls')
  148. if latest_urls is None:
  149. latest_urls = EntryPoint.gql("ORDER BY last_updated DESC").fetch(25)
  150. # Generate a display address that truncates the URL, adds an ellipsis.
  151. # This is never actually saved in the Datastore.
  152. for entry_point in latest_urls:
  153. entry_point.display_address = \
  154. entry_point.translated_address[:MAX_URL_DISPLAY_LENGTH]
  155. if len(entry_point.display_address) == MAX_URL_DISPLAY_LENGTH:
  156. entry_point.display_address += '...'
  157. if not memcache.add('latest_urls', latest_urls,
  158. time=EXPIRATION_RECENT_URLS_SECONDS):
  159. logging.error('memcache.add failed: latest_urls')
  160. # Do this dictionary construction here, to decouple presentation from
  161. # how we store data.
  162. secure_url = None
  163. if self.request.scheme == "http":
  164. secure_url = "https://mirrorrr.appspot.com"
  165. context = {
  166. "latest_urls": latest_urls,
  167. "secure_url": secure_url,
  168. }
  169. self.response.out.write(template.render("main.html", context))
  170. class MirrorHandler(BaseHandler):
  171. def get(self, base_url):
  172. assert base_url
  173. # Log the user-agent and referrer, to see who is linking to us.
  174. logging.debug('User-Agent = "%s", Referrer = "%s"',
  175. self.request.user_agent,
  176. self.request.referer)
  177. logging.debug('Base_url = "%s", url = "%s"', base_url, self.request.url)
  178. translated_address = self.get_relative_url()[1:] # remove leading /
  179. mirrored_url = HTTP_PREFIX + translated_address
  180. # Use sha256 hash instead of mirrored url for the key name, since key
  181. # names can only be 500 bytes in length; URLs may be up to 2KB.
  182. key_name = get_url_key_name(mirrored_url)
  183. logging.info("Handling request for '%s' = '%s'", mirrored_url, key_name)
  184. content = MirroredContent.get_by_key_name(key_name)
  185. cache_miss = False
  186. if content is None:
  187. logging.debug("Cache miss")
  188. cache_miss = True
  189. content = MirroredContent.fetch_and_store(key_name, base_url,
  190. translated_address,
  191. mirrored_url)
  192. if content is None:
  193. return self.error(404)
  194. # Store the entry point down here, once we know the request is good and
  195. # there has been a cache miss (i.e., the page expired). If the referrer
  196. # wasn't local, or it was '/', then this is an entry point.
  197. if (cache_miss and
  198. 'Googlebot' not in self.request.user_agent and
  199. 'Slurp' not in self.request.user_agent and
  200. (not self.request.referer.startswith(self.request.host_url) or
  201. self.request.referer == self.request.host_url + "/")):
  202. # Ignore favicons as entry points; they're a common browser fetch on
  203. # every request for a new site that we need to special case them here.
  204. if not self.request.url.endswith("favicon.ico"):
  205. logging.info("Inserting new entry point")
  206. entry_point = EntryPoint(
  207. key_name=key_name,
  208. translated_address=translated_address)
  209. try:
  210. entry_point.put()
  211. except (db.Error, apiproxy_errors.Error):
  212. logging.exception("Could not insert EntryPoint")
  213. for key, value in content.headers.iteritems():
  214. self.response.headers[key] = value
  215. if not DEBUG:
  216. self.response.headers['cache-control'] = \
  217. 'max-age=%d' % EXPIRATION_DELTA_SECONDS
  218. self.response.out.write(content.data)
  219. app = webapp.WSGIApplication([
  220. (r"/", HomeHandler),
  221. (r"/main", HomeHandler),
  222. (r"/([^/]+).*", MirrorHandler)
  223. ], debug=DEBUG)
  224. def main():
  225. wsgiref.handlers.CGIHandler().run(app)
  226. if __name__ == "__main__":
  227. main()