12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576 |
- #!/usr/bin/env python3
- # The point of this module is to apply some transformation to the output of
- # mandoc, in order to improve the HTML
- import bleach
- import glob
- import os
- import re
- import sys
- from pyquery import PyQuery as pq
# Matches a reference to a manpage such as ls(1):
# - some manpages use a bold or italic name, so an optional <strong> or <em>
#   opening tag is matched first
# - lazy match of the characters valid in a manpage name, named <page>
# - optional closing </em> or </strong> tag
# - '('
# - a digit, then a lazy match of anything except whitespace, named <section>
# - ')'
# A raw string is used so that \( \s \) reach the regex engine untouched
# instead of being (invalid) string escapes.
_MANPAGE_REF_RE = re.compile(
    r'(?:<em>|<strong>)?(?P<page>[a-zA-Z0-9._]+?)(?:</em>|</strong>)?'
    r'\((?P<section>[0-9][^\s]*?)\)')

# Markup emitted for a reference whose target page exists.
_MANPAGE_LINK = (
    '<span class="citerefentry">'
    '<span class="refentrytitle">'
    '<a href="{page}.{section}">{page}</a>'
    '</span>'
    '<span class="manvolnum">({section})</span>'
    '</span>')


def manpage_exists(page, section):
    """Return True if the HTML manpage for page(section) is a regular file.

    The path checked is ./man/man<section>/<page>.<section>.html, relative to
    the current working directory.
    """
    return os.path.isfile(
        './man/man' + section + '/' + page + '.' + section + '.html')


def link_manpage_refs(html, page_exists=manpage_exists):
    """Replace references to other manpages in *html* with links.

    Example: ls(1) -> <a href="ls.1">ls</a>(1), wrapped in citerefentry /
    refentrytitle / manvolnum spans.

    html        -- the HTML text to transform.
    page_exists -- callable (page, section) -> bool deciding whether a link
                   is made; defaults to checking the ./man tree on disk.

    Returns the transformed text.  A reference whose target does not exist is
    rewritten as ' page(section)'.
    """
    def replace(match):
        page = match.group('page')
        section = match.group('section')

        # Only make a link if the targeted HTML manpage exists.
        if page_exists(page, section):
            return _MANPAGE_LINK.format(page=page, section=section)

        # NOTE(review): this fallback drops any <em>/<strong> wrapper that was
        # matched and prepends a space; kept as-is because the intent is not
        # clear from this file.  The pattern can also consume a closing tag
        # whose opening tag was not part of the match -- TODO confirm.
        return ' ' + page + '(' + section + ')'

    return _MANPAGE_REF_RE.sub(replace, html)


# Loop HTML files (mandoc output)
for file in glob.iglob('man/**/*.html', recursive=True):

    print(file)

    # The encoding is pinned so the result does not depend on the locale of
    # the machine running the script (assumes mandoc emits UTF-8 compatible
    # HTML -- TODO confirm).
    with open(file, 'rt', encoding='utf-8') as f:
        html = f.read()

    html = link_manpage_refs(html)

    # DISABLED hack, kept for reference.
    # Unfortunately mandoc outputs all <tags> one below the other, so there is
    # no way to separate a section title from its content. This is bad because
    # it makes it impossible to add margin-left to the content of a section
    # from CSS. Since all section titles have the form
    #   <h1 class="Sh" title="Sh" id="NAME"><a class="selflink" href="#NAME">NAME</a>
    # and since apparently only section titles use <h1> tags, the idea is to
    # wrap all content between a </h1> and the next <h1> into a <div>.
    # regex:
    # - look behind for a </h1> tag (end of a section title)
    # - match everything that follows
    # - look ahead for the next <h1 (start of a new section title);
    #   <table class="foot"> is used to match the end of the last section
    #
    # html = re.sub(
    #     '(?<=</h1>)(?P<content>.*?)(?=<h1|<table class="foot">)',
    #     lambda match: '<div class="Sh_content">' + match.group('content') + '</div>',
    #     html, flags=re.DOTALL)

    with open(file, 'wt', encoding='utf-8') as f:
        f.write(html)
|