paone
/
blog


			
				
					
						
						
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364
							#!/usr/bin/env perl

#-------------------------------------------------------------------------------
# Automatically generate certain parts of the website (the index, the page
# containing all of the posts, etc.) and add boilerplate to the written posts,
# such as TOC generation, general metadata and the navigation menu.
# 
# TODO:
# 	- Add proper error handling
#-------------------------------------------------------------------------------

use v5.40;
use open qw(:std :encoding(UTF-8));
use Unicode::UTF8 qw(decode_utf8);

use Getopt::Long qw(:config no_ignore_case bundling);
use File::Find;
use File::Basename;
use Time::Piece;

# Collapses multiline headings into a single line
sub collapse_headings {
	# @_: the document, one line per element.
	# Returns the same lines, except that every HTML heading that spans
	# several lines (from a line containing '<hN>' up to the line containing
	# '</hN>') is merged into a single element, with its interior newlines
	# turned into spaces.
	my @lines = @_;
	my @collapsed;
	my $heading = '';
	my $in_heading = 0;

	foreach my $line (@lines) {
		$in_heading = 1 if $line =~ /<h\d>/;

		# Lines outside a heading pass through untouched. This also covers
		# a closing tag whose opener carried attributes (and so was never
		# recognized): nothing was buffered, so nothing extra is emitted.
		unless ($in_heading) {
			push(@collapsed, $line);
			next;
		}

		$heading .= $line;
		next unless $line =~ /<\/h\d>/;

		# Fold every interior newline, but keep the line terminator so the
		# heading stays on its own line in the output.
		my $eol = ($heading =~ s/\n\z//) ? "\n" : '';
		$heading =~ s/\n/ /g;
		push(@collapsed, $heading.$eol);
		$heading = '';
		$in_heading = 0;
	}

	# An unclosed heading must not swallow the rest of the document: hand
	# the buffered text back as-is.
	push(@collapsed, $heading) if $in_heading;

	return @collapsed;
}

#-------------------------------------------------------------------------------
# Generates the TOC for the document.
#
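# $1: string - Name of the file, used in error messages
# $2: bool - Whether the document is Markdown instead of HTML
# $3: [string] - Array containing each line of the document
# returns: table of contents, title, document with ids added to the headings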
# 
# Since the '<ol>' element can't handle proper subsection numbering without
# help from CSS, and we want to support text-only browsers as much as we can,
# we need to generate the TOC already with the section numbering. This can be
# done by deriving the outline from the order of appearance of the heading
# elements which must conform with the HTML specification. Which is:
# 
# An HTML document can't have its headings nested more than a single step at a
# time, but when moving out of the tree, the jump can be of any length up to
# heading 1. So, as we move through the headings, we compare each level with
# the previous one, while keeping track of the subsection numbering inside an
# array. And, since all headings are children of the '<h1>' element (which
# also usually — and in my documents, always — serves as the title), we skip
# adding it to the TOC.
# 
# Also, so that the TOC can reference the sections, we add IDs to the headings
# and return them as a modified array.
#-------------------------------------------------------------------------------
sub gentoc {
	# $file:  name of the document, only used in error messages
	# $md:    true when the document is Markdown, false when it is HTML
	# @lines: the document, one line per element
	#
	# Returns ($toc, $title, $document): the numbered table of contents
	# (empty string when there are no sub-headings), the text of the first
	# level-1 heading, and the document joined into one string. In HTML mode
	# the sub-headings carry ids; in Markdown mode an '<a id>' anchor is put
	# above each sub-heading and the TOC is inserted after the title.
	#
	# Dies when the first heading is not level 1, when a later heading is
	# level 1, when a level is outside 1..6, when nesting deepens by more
	# than one step, or when there is no title at all.
	my ($file, $md, @lines) = @_;

	# Takes the 0-based index into @lines and reports it as a 1-based line.
	my $improper_heading = sub {
		my ($idx) = @_;
		return "$file: Improper heading at line ".($idx + 1);
	};

	my @counters;     # current section number, one counter per depth
	my $title;
	my $toc = '';
	my @md_anchors;   # [line index, id] for every Markdown sub-heading
	my $md_title_idx;
	my $last_level = 0;

	# Markdown headings must start their line; otherwise prose containing
	# '# ' (e.g. "C# is ...") would be taken for a heading.
	my $heading_regex = $md
		? qr/^\s*(#+)\s(.*)/
		: qr/\s*<h(\d)>(.*)<\/h[\d]/;

	# Assemble the outline.
	# TODO: Catch unclosed heading tags
	for my $i (0..$#lines) {
		# Skip lines without headings
		next unless $lines[$i] =~ $heading_regex;

		my ($level, $heading) = ($1, $2);

		# In Markdown the level is the number of '#'
		$level = length $level if $md;

		die $improper_heading->($i) if $level < 1 || $level > 6;

		# Don't add the <h1> heading to the TOC, but set it as the title
		if ($last_level == 0) {
			die $improper_heading->($i) if $level != 1;
			$md_title_idx = $i if $md;
			$title = $heading;
			$last_level = 1;
			next;
		}

		# Only the title may be a level-1 heading
		die $improper_heading->($i) if $level == 1;

		# The id is the heading without quotes and with whitespace dashed
		my $id = $heading =~ s/'|"//gr =~ s/\s/-/gr;

		if ($md) {
			# Remember the id so the anchor matches the TOC link exactly
			push(@md_anchors, [$i, $id]);
		} else {
			# Reassemble the line with the id
			$lines[$i] = "<h$level id=$id>$heading</h$level>";
		}

		# Assemble the TOC as we traverse the headings. The list markup
		# is HTML only; Markdown items rely on indentation alone.
		if ($level > $last_level) { # start subsection
			# Nesting in steps larger than one is disallowed in the HTML spec
			die $improper_heading->($i) if $level - $last_level != 1;
			push(@counters, 1);
			$toc .= "<ul>\n" unless $md;
		} elsif ($level < $last_level) { # end subsection
			$toc .= "</li>\n" unless $md;
			# Close sections as we traverse up the tree
			for (1..$last_level - $level) {
				pop(@counters);
				$toc .= "\t" x @counters."</ul></li>\n" unless $md;
			}
			$counters[-1]++ if @counters;
		} else { # same subsection
			$counters[-1]++;
			$toc .= "</li>\n" unless $md;
		}

		# Add an item to the list
		my $section = join('.', @counters);
		my $indent = "\t" x ($md ? @counters - 1 : @counters);
		if ($md) {
			$toc .= $indent."[$section $heading](#$id)\n";
		} else {
			$toc .= $indent."<li><a href=#$id>$section $heading</a>";
		}

		$last_level = $level;
	}

	# Checked before the post-processing below, which needs the title index
	die "Missing <h1> heading at file: $file"
		unless defined $title && length $title;

	if ($md) {
		# Add anchors with IDs above each heading. Every insertion shifts
		# the following lines down by one.
		my $inserted = 0;
		for my $anchor (@md_anchors) {
			my ($idx, $id) = @$anchor;
			splice(@lines, $idx + $inserted++, 0, "<a id=\"$id\"></a>\n");
		}

		# Add TOC after the first h1 element (it precedes every anchor,
		# so its index is still valid)
		splice(@lines, $md_title_idx + 1, 0, "\n$toc");
		chomp $toc;
	} else {
		# Close remaining sections
		while (@counters) {
			pop(@counters);
			$toc .= "</li>\n";
			$toc .= "\t" x @counters."</ul>";
		}
	}

	return ($toc, $title, join('', @lines));
}

# Assembles the complete page with all the passed data and metadata.
# $1: string - Table of contents
# $2: string - Title of the document
# $3: string - The document's contents, already joined into a single string
# $4: string - Date the document was last modified
sub mkpage {
	# Wraps a document in the site's HTML page template.
	# Arguments, in order: table of contents (may be empty), page title,
	# document contents as one string, date shown in the page header.
	# Returns the complete page as a string.
	my ($toc, $title, $contents, $date) = @_;

	# A page without a TOC simply gets no <nav> inside its <main> header;
	# otherwise every TOC line is pushed five tabs in and wrapped.
	my $nav = '';
	if ($toc) {
		my $nav_pad = "\t" x 5;
		my $padded_toc = $toc =~ s/^/$nav_pad/mgr;
		$nav = "<nav>\n$padded_toc\n</nav>";
	}

	# Every content line is pushed three tabs in to sit inside <main>
	my $body_pad = "\t" x 3;
	my $body = $contents =~ s/^/$body_pad/mgr;

	return <<~"TEMPLATE";
	<!doctype html>
	<html lang="en">
		<head>
			<meta charset="utf-8">
			<meta name="author" content="Henrique F. T. Paone">
			<link rel="stylesheet" href="stylesheet.css">
			<title>$title</title>
		</head>
		<body>
			<header>
				<nav>
				</nav>
			</header>
			<main>
				<header>
					<p>$date</p>
					$nav
				</header>$body
			</main>
		</body>
	</html>
	TEMPLATE
}

#-------------------------------------------------------------------------------
# Generates the index page
#
# The index page is a site map with all of the written posts organized by themes,
# which are derived from the directory structure of the './posts' folder. The only
# part that is not generated is the beginning of the page, that is, the h1 tag and
# whatever preamble, if any, should come before the map.
# 
# To generate the main contents, which we call index, we traverse the posts
# folder, building a parent-to-children relational hash map. Then we traverse this
# map recursively with the local 'walk' function to partially assemble the index.
# Then we add the missing ul tags that should surround the li tags.
#-------------------------------------------------------------------------------
sub mkindex {
	# $domain: base URL prepended to every post link.
	# Returns the lines of the generated index: one heading per directory
	# (starting at <h2>, one level deeper per nesting step) followed by
	# '<li>' links to its posts, with each run of items wrapped in
	# '<ul>'/'</ul>'. Returns an empty list when no posts are found.
	my $domain = shift;
	# TODO: Specify the posts directory via command-line flag
	my $posts_dir = 'posts';
	my @index;

	# Map every directory (by basename) to the entries directly inside it.
	# Entries directly inside the posts directory itself are not recorded;
	# the first directory seen below it becomes the root of the index.
	my %dirs_to_files;
	my $root;
	find({
		wanted => sub {
			my $dir = decode_utf8(basename($File::Find::dir));
			# Work on a copy: $_ belongs to File::Find
			my $entry = decode_utf8($_);
			return if $entry eq '.' or $dir eq $posts_dir;
			$root //= $dir;
			push(@{$dirs_to_files{$dir}}, $entry);
		},
	}, "./$posts_dir");

	# Nothing below the posts directory: there is no index to build
	return () unless defined $root;

	# Traverse the dirs_to_files hash, adding the 'h' and 'li' tags
	my $walk = sub {
		my ($parent, $level) = @_;
		my @children = @{$dirs_to_files{$parent} // []};
		return unless @children;

		push(@index, "<h$level>".ucfirst($parent)."</h$level>\n");
		for my $child (@children) {
			# An entry that is itself a known directory becomes a subsection
			if ($dirs_to_files{$child}) {
				__SUB__->($child, $level + 1);
				next;
			}

			# The link keeps '.html'; only the literal '.post' part goes
			$child =~ s/\.post//;

			# The label drops the extension, turns separators into
			# spaces and capitalizes every word
			my $item = $child =~ s/\.html\z//r;
			$item =~ s/_|-/ /g;
			$item =~ s/([\w']+)/\u\L$1/g;

			push(@index, "\t<li><a href=\"$domain/$child\">$item</a></li>\n");
		}
	};
	$walk->($root, 2);

	# Surround each run of 'li' tags with 'ul' tags: open before the first
	# item of a run, close when the next heading shows up.
	my @index_with_ul;
	my $in_list = 0;
	for my $entry (@index) {
		if ($in_list && $entry =~ /<h[1-6]>/) {
			push(@index_with_ul, "</ul>\n");
			$in_list = 0;
		} elsif (!$in_list && $entry =~ /<li>/) {
			push(@index_with_ul, "<ul>\n");
			$in_list = 1;
		}
		push(@index_with_ul, $entry);
	}
	# Close the last run if the index ends on an item
	push(@index_with_ul, "</ul>\n") if $in_list;
	return @index_with_ul;
}

# Parse command line options
my $domain = "https://localhost";
my $port = '';
my $date = '';
my $toc_only = '';
my $md = 0;
# GetOptions has already reported the offending option when it returns false
GetOptions(
	'd|domain=s' => \$domain,
	'p|port=s' => \$port,
	'D|date=s' => \$date,
	't|toc-only' =>  \$toc_only,
	'm|markdown' => \$md
) or die "Invalid command line options.\n";

$domain = "$domain:$port" if $port;

# Get the file's contents by opening it or via STDIN
my @lines;
my $file = $ARGV[0];

if ($file) {
	open(my $contents, "<", $file) or die "Could not open $file: $!\n";
	@lines = <$contents>;
	close($contents);
} elsif (-p STDIN or -f STDIN) {
	# Accept both a pipe and a file redirected into STDIN. From here on
	# $file is only a label for error messages.
	$file = "Reading input file's contents from STDIN";
	@lines = <STDIN>;
} else {
	die 'No input supplied.';
}

@lines = collapse_headings(@lines);

# Check if we are generating the index: the first line that consists of
# only the '<index/>' marker is removed and turns index generation on.
my $is_index = 0;
for my $idx (0..$#lines) {
	next unless $lines[$idx] =~ /^<index\/>$/;
	splice(@lines, $idx, 1);
	$is_index = 1;
	last;
}

# The generated index goes at the end of the document
push(@lines, "\n", mkindex($domain)) if $is_index;
my ($toc, $title, $contents) = gentoc($file, $md, @lines);

if ($toc_only) {
	# A document without sub-headings has no TOC to show
	print(($toc // '')."\n");
	exit 0;
}

# Markdown is emitted as-is; HTML gets wrapped in the page template
print $md ? $contents : mkpage($toc, $title, $contents, $date);