#!/usr/bin/env python
# -*- coding: utf-8 -*-
''' Text-file and page helpers that derive titles, heading identifiers and a
table of contents from page source text. '''

import os
import re

import pandoc

# Text removed when turning a heading into an identifier:
#   - the whole leading run of non-letters (digits, underscores, punctuation),
#   - the listed punctuation anywhere else in the text.
# Underscores, hyphens and periods after the first letter are kept.
_HEADER_ID_JUNK = re.compile(
    r'''^[\W\d_]+|[+~!@#$%^&*(){}\[\];:"',<>?/`=|\\]''')

# Setext underlines: a line starting with at least two '=' or '-' characters.
_SETEXT_LEVEL_1 = re.compile(r'[=]{2}')
_SETEXT_LEVEL_2 = re.compile(r'[-]{2}')

# ATX heading gate: leading '#', at least one more character, and an ASCII
# letter or digit somewhere after that.
_ATX_HEADING = re.compile(r'#+.+[A-Za-z0-9]')


class TextFile:
    ''' Basic text file: a filename paired with its source text. '''

    def __init__(self, filename, source):
        self.filename = filename
        self.source = source

    @property
    def lines(self):
        ''' Returns list of lines from source.

        Leading and trailing whitespace (including blank lines) is removed
        from the source before it is split on newlines, so the list always
        has at least one element (an empty string for a blank source).
        '''
        return self.source.strip().split('\n')

    @property
    def firstline(self):
        ''' Returns the first line of the stripped source. '''
        return self.lines[0]


class TOCItem:
    ''' Table of Contents line item.

    Holds the heading text and its outline level.  An item can also be
    indexed or unpacked like a ``(heading, level)`` tuple, so it can be used
    wherever such a pair is expected.
    '''

    def __init__(self, heading, level):
        self.heading = heading
        self.level = level

    def __iter__(self):
        return iter((self.heading, self.level))

    def __getitem__(self, index):
        return (self.heading, self.level)[index]

    def __len__(self):
        return 2

    def __repr__(self):
        return 'TOCItem(%r, %r)' % (self.heading, self.level)

    @property
    def header_id(self):
        ''' Returns an identifier (anchor name) derived from the heading.

        Rules applied:
          - Replace all spaces and newlines with hyphens.
          - Convert all alphabetic characters to lowercase.
          - Remove all punctuation, except underscores, hyphens, and
            periods.
          - Remove everything up to the first letter (identifiers may not
            begin with a number or punctuation mark).

        Inline formatting and link markup are not parsed; only their
        punctuation characters are dropped.  The result may be an empty
        string when the heading contains no letters.

        NOTE(review): these rules look like pandoc's automatic heading
        identifiers - confirm against the pandoc version in use.
        '''
        slug = self.heading.lower().replace('\n', ' ').replace(' ', '-')
        return _HEADER_ID_JUNK.sub('', slug)


class Page(TextFile):
    ''' Represents single page source file. '''

    def __init__(self, filename, source, format='markdown'):
        ''' Stores the file data and loads the source into a pandoc document.

        filename -- name of the page's source file
        source   -- full text of the page
        format   -- input format name passed through to pandoc
        '''
        TextFile.__init__(self, filename, source)
        self.doc = pandoc.Document()
        # NOTE(review): `_input` is a private-looking method of the pandoc
        # binding - confirm it is the intended entry point.
        self.doc._input(self.source, format=format)

    @property
    def markdown(self):
        ''' Returns the ``markdown`` attribute of the pandoc document. '''
        return self.doc.markdown

    @property
    def title(self):
        ''' Returns the page title together with its outline level.

        The result can always be indexed or unpacked as ``(title, level)``.
        The level is preserved so that page titles can be properly
        positioned (indented) in the table of contents.

        If the first line starts with ``%`` (a pandoc title block), the
        title is retrieved from there and returned with a level of 1.  A
        bare ``%`` line yields an empty title.

        If a title block is not found, the first heading is returned (as a
        TOCItem) with the corresponding heading level.

        Finally, if no title block or headings are found in the page, the
        filename is returned as the title with underscores changed to
        spaces, at level 1.
        '''
        first = self.lines[0]
        if first.startswith('%'):
            # Everything after the marker is the title; slicing (rather than
            # splitting on a space) also copes with '%Title' and a bare '%'.
            return (first[1:].strip(), 1)

        headings = self.toc
        if headings:
            return headings[0]

        return (self.filename.replace('_', ' '), 1)

    @property
    def toc(self):
        ''' Returns a list of TOCItems of the page.

        Two heading styles are recognised, line by line:
          - setext: a non-blank line followed by a line starting with
            ``==`` (level 1) or ``--`` (level 2);
          - ATX: a line starting with one or more ``#``; the level is the
            number of leading ``#`` characters.

        This is a plain line scan: code blocks, tables and other constructs
        are not detected, so lines inside them that look like headings are
        reported as headings.
        '''
        items = []
        candidate = ''  # previous line's text, a possible setext heading
        for line in self.lines:
            if candidate and _SETEXT_LEVEL_1.match(line):
                items.append(TOCItem(candidate, 1))
                # Consumed: a following rule line must not re-emit it.
                candidate = ''
                continue
            if candidate and _SETEXT_LEVEL_2.match(line):
                items.append(TOCItem(candidate, 2))
                candidate = ''
                continue
            if _ATX_HEADING.match(line):
                level = len(line) - len(line.lstrip('#'))
                # Take everything after the hash run so that a missing
                # space after the hashes neither fails nor drops text.
                items.append(TOCItem(line[level:].strip(), level))
                candidate = ''
                continue
            candidate = line.strip()
        return items

    @property
    def htmlfile(self):
        ''' Returns the filename with its extension replaced by ``.html``.

        A filename without an extension simply gets ``.html`` appended.
        '''
        return os.path.splitext(self.filename)[0] + '.html'