WebArchiver.hs 2.0 KB

1234567891011121314151617181920212223242526272829303132333435363738394041
{-| Scans a page of Markdown looking for http links. When it finds them, it submits them
to webcitation.org / https://secure.wikimedia.org/wikipedia/en/wiki/WebCite
(It will also submit them to Alexa (the source for the Internet Archive), but Alexa says that
its bots take weeks to visit and may never do so.)

This module employs the archiver daemon <http://hackage.haskell.org/package/archiver> as a library;
`cabal install archiver` will install it.

Limitations:

* Only parses Markdown, not ReST or any other format; this is because 'readMarkdown'
  is hardwired into it.

* No rate limitation or choking; will fire off all requests as fast as possible.
  If pages have more than 20 external links or so, this may result in your IP being temporarily
  banned by WebCite. To avoid this, you can use WebArchiverBot.hs instead, which will parse & dump
  URLs into a file processed by the archiver daemon (which *is* rate-limited).

By: Gwern Branwen; placed in the public domain -}
  14. module WebArchiver (plugin) where
import Control.Concurrent (forkIO)

import Network.Gitit.Interface (askUser, bottomUpM, liftIO, uEmail, Plugin(PreCommitTransform), PluginM, Inline(Link))
import Network.URL.Archiver as A (checkArchive)
import Text.Pandoc (defaultParserState, readMarkdown)
  19. plugin :: Plugin
  20. plugin = PreCommitTransform archivePage
  21. -- archivePage :: String -> ReaderT PluginData (StateT Context IO) String
  22. archivePage x = do mbUser <- askUser
  23. let email = case mbUser of
  24. Nothing -> "nobody@mailinator.com"
  25. Just u -> uEmail u
  26. let p = readMarkdown defaultParserState x
  27. -- force evaluation and archiving side-effects
  28. _p' <- liftIO $ bottomUpM (archiveLinks email) p
  29. return x -- note: this is read-only - don't actually change page!
  30. archiveLinks :: String -> Inline -> IO Inline
  31. archiveLinks e x@(Link _ (uln, _)) = forkIO (A.checkArchive e uln) >> return x
  32. archiveLinks _ x = return x