html_postprocess.py 2.9 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576
  1. #!/usr/bin/env python3
  2. # The point of this module is to apply some transformation to the output of
  3. # mandoc, in order to improve the HTML
  4. import bleach
  5. import glob
  6. import os
  7. import re
  8. import sys
  9. from pyquery import PyQuery as pq
  10. # Loop HTML files (mandoc output)
  11. for file in glob.iglob ('man/**/*.html', recursive=True):
  12. print (file)
  13. with open (file, 'rt') as f:
  14. html = f.read ()
  15. # Replace references to other manpages with links.
  16. # Example: ls(1) -> <a href="">ls</a>(1)
  17. # regex:
  18. # - some manpages use a bold or italic name, so we match an optional <strong>
  19. # or <em> tag
  20. # - lazy match valid characters for a manpage, assign name <page>
  21. # - match optional closing </em> or </strong> tags
  22. # - match '('
  23. # - match number, then lazy match anything except a space, assign name <section>
  24. # - match ')'
  25. html = re.sub (
  26. '(?:<em>|<strong>)?(?P<page>[a-zA-Z0-9._]+?)(?:</em>|</strong>)?\((?P<section>[0-9][^\s]*?)\)',
  27. lambda match:
  28. '<span class="citerefentry">' +
  29. '<span class="refentrytitle">' +
  30. '<a href="' + match.group ('page') + '.'
  31. + match.group ('section') + '">' +
  32. match.group ('page') +
  33. '</a>' +
  34. '</span>' +
  35. '<span class="manvolnum">(' + match.group ('section') + ')</span>' +
  36. '</span>'
  37. # Only make link if the targeted html manpage exist
  38. if os.path.isfile (
  39. './man/man' + match.group ('section') + '/' + match.group ('page') + '.' +
  40. match.group ('section') + '.html')
  41. else ' ' + match.group ('page') + '(' + match.group ('section') + ')',
  42. html)
  43. # Unfortunately mandoc outputs all <tags> one below the other, so there is no way
  44. # to separate a section title to its content. This is bad because it makes it
  45. # impossible to add margin-left to the content of a section from CSS.
  46. # So, this hack is to try to fix the issue. Since all section titles have the form
  47. # <h1 class="Sh" title="Sh" id="NAME"><a class="selflink" href="#NAME">NAME</a>
  48. # and since apparently only section titles use <h1> tags, we try to wrap all
  49. # content between a </h1> and the next <h1> into a <div>.
  50. # regex:
  51. # - look for a </h1> tag (and of section title)
  52. # - match everything next
  53. # - look for the next <h1> (start of new section title)
  54. # <table class="foot"> is used to match the end of the last section
  55. """
  56. html = re.sub (
  57. '(?<=</h1>)(?P<content>.*?)(?=<h1|<table class="foot">)',
  58. lambda match: '<div class="Sh_content">' + match.group ('content') + '</div>',
  59. html, flags=re.DOTALL)
  60. """
  61. with open (file, 'wt') as f:
  62. f.write (html)