diff --git a/rst_template/bk_main.xml b/rst_template/bk_main.xml index 3ed5502..b792d5b 100644 --- a/rst_template/bk_main.xml +++ b/rst_template/bk_main.xml @@ -27,7 +27,7 @@ diff --git a/rst_template/opf_html2db.py b/rst_template/opf_html2db.py new file mode 100644 index 0000000..79f1c33 --- /dev/null +++ b/rst_template/opf_html2db.py @@ -0,0 +1,692 @@ +#!/usr/bin/python +# -*- coding: utf-8 -*- +# +# Builds OpenPOWER Foundation documentation using standard template. +# +# Assumes rst2db has been used to convert rst to docbook. +# +import os, sys, getopt, shutil, errno, subprocess, copy, re +from os import fdopen, remove +from shutil import move +from git import Repo +from lxml import etree +from conf import opf_docbook_settings, master_doc, project +from subprocess import Popen, PIPE + + +def copy_xml_to_template(src_dir, tgt_dir): + # Copy XML files + src_files = os.listdir(src_dir) + for filename in src_files: + full_file = os.path.join (src_dir, filename) + if (os.path.isfile(full_file)): + shutil.copy(full_file, tgt_dir) + elif (os.path.isdir(full_file)): + try: + os.makedirs(os.path.join(tgt_dir,filename)) + except OSError as exception: + if exception.errno != errno.EEXIST: + raise + copy_xml_to_template( os.path.join(src_dir,filename), os.path.join(tgt_dir,filename) ) + +def update_file(filename, old_str, new_str): + # Verify tag exists + with open(filename) as f: + s = f.read() + if old_str not in s: + print 'Error: "{old_str}" not found in {filename}.'.format(**locals()) + sys.exit(-2) + + # Safely write the changed content, if found in the file + with open(filename, 'w') as f: + s = s.replace(old_str, new_str) + f.write(s) + +def traverse_clean_html_source_examples(filename): + temp_file = filename + '.tmp' + code_found = False + html_source_start_regex = '^
'
+    html_source_stop_regex = '^
' + span_regex = '\' + + print filename + + # Walk file by line + with open(temp_file, 'w') as new_file: + with open(filename) as old_file: + for line in old_file: + if re.match(html_source_start_regex,line): + # print 'DEBUG: Code block start found' + code_found = True + elif re.match(html_source_stop_regex,line): + # print 'DEBUG: Code block stop found' + code_found = False + + if code_found: + oldline = line + # Remove references + line = line.replace('', '') + # Remove references + line = re.sub(span_regex, '', line) + # print 'DEBUG: line changed.\n Old: >' + oldline + '<\n New: >' + line + '<' + new_file.write(line) + + # Preserve old file + move(filename, filename + '.bak') + + # Move new file into old + move(temp_file, filename) + +def traverse_clean_html_nodes(element): + + if 'ul' in element.tag and element.attrib: + key = element.attrib.keys()[0] + value = element.attrib[key] + if 'id' in key: + first_child = element.__getitem__(0); + if first_child.__len__() == 0: + print 'Error: Bad assumption.
    tag is empty.' + + # Add attribute to first_child and remove from element + first_child.attrib[ key ] = value; + del element.attrib[ key ] + + # print 'DEBUG:
      attributes: ', element.attrib + # print 'DEBUG: child attributes: ', first_child.attrib + sys.stderr.write( '**Information: id attribute on
        tag to first sub-element, <' + element.tag + '> for ' + key + ' = ' + value + '\n' ) + + for child in element.getchildren(): + traverse_clean_html_nodes(child) + +def cleanup_html(infile, outfile): + + # Create internal representation of document from infile + parser = etree.XMLParser(remove_comments=False) + tree = etree.parse(infile, parser=parser) + head = tree.getroot() + + # print_tree( head, 0, 2 ) + + # Walk nodes doing any cleanup + traverse_clean_html_nodes(head) + + # Persist updates to output file + tree.write(outfile) + + # Note: This invocation needs to occur post tree-write because + # it will update file + traverse_clean_html_source_examples(outfile) + +def find_match(reference, anchor_node, relationship): + + if not anchor_node is None and 'anchor' in anchor_node.tag: + # Try this, verify matching ids + key = anchor_node.attrib.keys()[0] + value = anchor_node.attrib[key] + regex = '^' + reference + '(\.\d+)?$' + + # print 'DEBUG: ' + relationship + ' anchor check. Reference: ' + reference + ' Regex: ' + regex + ' Value: ' + value + + if re.match(regex,value): + return anchor_node + + else: + # print 'DEBUG: Anchor in ' + relationship + ' tag does not match. Expected: ', reference, ' Found: ', value, ' Looking further...' + node = anchor_node + while not node.getprevious() is None: + node = node.getprevious() + if 'anchor' in node.tag: + key = node.attrib.keys()[0] + value = node.attrib[ key ] + if re.match(regex,value): + # print 'DEBUG: Anchor in ' + relationship + ' tag finally match!!!' + return node + # else + # print 'DEBUG: Anchor in ' + relationship + ' tag does not match. Expected: ', reference, ' Found: ', value, ' Looking further...' + + else: + # print 'DEBUG: Anchor in ' + relationship + ' tag does not match. Expected: ', reference, ' Found: ', value, ' Anchor node: ', node + return None + + else: + # print 'Error: find_match called with non-anchor element. Reference: ' + reference + ' Node: ' + anchor_node + ' Relationship: ' + relationship + return None + +def traverse_clean_links(element): + + if 'link' in element.tag: + # Note: Terminal tag, no need to recurse + + # Gather link details + text = element.text + num_attributes = element.attrib.__len__() + reference = element.attrib.get('linkend',None) + + if num_attributes is 1 and not reference is None and text == u'¶': + # Erroneous link message, find related anchor, could be "uncle" or "cousin" (of various degrees) + anchor = None + parent = element.getparent() + grandparent = parent.getparent() + greatuncle = grandparent.getprevious() + + # Check Great Uncle for match + anchor = find_match(reference, greatuncle, 'Great Uncle') + + # If no match, locate "cousin" and if found, check it + if anchor is None: + cousin = None + if not greatuncle is None: + node = greatuncle + while node.__len__() > 0 and cousin is None: + node = node.__getitem__(node.__len__() -1) + if 'anchor' in node.tag: + cousin = node + + if not cousin is None: + anchor = find_match(reference, cousin, 'Cousin') + + # If no match, try uncle + if anchor is None: + uncle = parent.getprevious() + anchor = find_match(reference, uncle, 'Uncle') + + # Always delete tag of this type (contains only u'¶' for text) + parent.__delitem__(parent.index(element)) + + if not anchor is None: + # print 'MATCH FOUND: ', reference + + # Retrieve attribute key from anchor + # Note: The key is always correctly set by herold in the case of duplicate keys. + # The tag may have a "dot" and a number appended to value in . + key = anchor.attrib.keys()[0] + value = anchor.get(key) + if 'title' in parent.tag: + # Add id attribute to Grandparent + grandparent.set(key,value) + else: + # Add id attribute to Parent + parent.set(key,value) + + sys.stderr.write( '**Information: removed dummy link and for ' + reference + ' and added proper xml:id as ' + value + '\n' ) + + # Delete tag + anchor_parent = anchor.getparent() + anchor_parent.__delitem__(anchor_parent.index(anchor)) + else: + # Nothing more to do + sys.stderr.write( '**Information: Matching element not found for reference = ' + reference + '. Link removed.' + '\n' ) + + + else: + for child in element.getchildren(): + traverse_clean_links(child) + +def traverse_clean_other(element): + if 'informalexample' in element.tag: + # Get key elements around this one + parent = element.getparent() + grandparent = parent.getparent() + + # Create new elements -- section and title (use text from informal example element) + new_section = parent.makeelement(grandparent.tag) + new_title = parent.makeelement('title') + title = element.text + new_title.text = title + + # Add title to new section + new_section.append(new_title) + + # Copy over children from to new
        + for child in element.getchildren(): + element.remove(child) + new_section.append(child) + + # print 'DEBUG: old tree...' + # print_tree(parent, 0, 2) + + # Add new
        as next sibling of parent and remove from parent + parent.addnext(new_section) + parent.remove(element) + + # print 'DEBUG: new tree...' + # print_tree(parent.getparent(), 0, 3) + + sys.stderr.write( '**Information: ' + element.text + ' removed and promoted as
        with title: ' + title + '\n' ) + + elif 'note' in element.tag: + # Get key elements around this one + parent = element.getparent() + grandparent = parent.getparent() + + # print 'DEBUG: old tree...' + # print_tree(parent, 0, 4) + + # Create new elements -- section and title (use text from bridgehead subelement) + new_section = parent.makeelement(parent.tag) + bridgehead = element.__getitem__(0).__getitem__(0) + + if not 'bridgehead' in bridgehead.tag: + print 'Error: Bad assumption about structure. Bridgehead not found as expected.' + sys.exit(-20) + + title = bridgehead.text + new_title = parent.makeelement('title') + new_title.text = title + + # Add title to new section + new_section.append(new_title) + + # Remove from + bridgehead.getparent().remove(bridgehead) + + # Copy over remaining items in to new
        + for child in element.getchildren(): + element.remove(child) + new_section.append(child) + + # Add new
        as next sibling of parent and remove from parent + parent.addnext(new_section) + parent.remove(element) + + # print 'DEBUG: New tree...' + # print_tree(grandparent, 0, 3) + + sys.stderr.write( '**Information: removed and promoted as
        with title: ' + title + '\n' ) + + elif 'anchor' in element.tag: + # Get key elements around this one + parent = element.getparent() + + # Retrieve anchor details + key = element.attrib.keys()[0] + value = element.attrib[ key ] + + # Remove node + parent.remove( element ); + + sys.stderr.write( '**Information: removed with id: ' + value + '\n' ) + + elif 'section' in element.tag: + #Ensure at least one child beyond + if element.__len__() == 1: + title = element.__getitem__(0).text + parent = element.getparent() + + # Make and add empty paragraph to section, just behind title + new_para = parent.makeelement('para') + new_para.text = ' ' + element.append(new_para) + + sys.stderr.write( '**Information: <para> tag added to empty section with title: ' + title + '\n' ) + + for child in element.getchildren(): + traverse_clean_other(child) + +def cleanup_xml(infile, outfile): + # Create internal representation of document from infile + parser = etree.XMLParser(remove_comments=False) + tree = etree.parse(infile, parser=parser) + head = tree.getroot() + + # print_tree( head, 0, 2 ) + + # Note: because link cleanup involves relative location of multiple tags, it must be separate and first + traverse_clean_links(head) + traverse_clean_other(head) + + # Persist updates to output file + tree.write(outfile) + +def print_tree(element, level, max_depth): + # Print current element + num_children = element.__len__() + indent = ' '.ljust(level+1) + + if level < max_depth: + print indent, 'Tag: ', element.tag, ' Attrib: ', element.attrib, ' Text: >', element.text, '< Num children: ', num_children + + for i in range(num_children): + child = element.__getitem__(i) + print_tree(child, level+1, max_depth) + +def traverse_clean_sections(element): + section_blacklist = ['Navigation', 'Table Of Contents'] + + # Walk children looking for next set of <section> tags, opening include files if necessary + num_children = element.__len__() + i = 0; + while i < num_children: + child = element.__getitem__(i) + parent = element + + # print 'DEBUG: clean sections, visiting node with tag: ' + child.tag + + # Walk first level of tags, deleting info and any "blacklist" sections + if 'section' in child.tag: + num_sec_children = child.__len__() + + title = '' + if num_sec_children > 0: + first_grandchild = child.__getitem__(0) + if first_grandchild.__len__() == 0: + title = child.__getitem__(0).text + else: + # This makes me nervous, not sure how well it will work... + title = first_grandchild.__getitem__(0).text + # print 'Section title found: ' + title + + if title in section_blacklist: + # Delete section + # print 'DEBUG: Deleted blacklist section ' + title + parent.remove(child) + num_children = num_children-1 + else: + traverse_clean_sections(child) + i = i+1 + else: + i=i+1 + +def eliminate_top_section(head): + + # Remove <info> and <index> sections + for child in head.getchildren(): + if 'info' in child.tag or 'index' in child.tag: + # print 'DEBUG: unneeded top level tag: ' + child.tag + head.remove(child) + + # Eliminate head section which really is title + if head.__len__() == 1: + first_section = head.__getitem__(0) + + if not 'section' in first_section.tag: + print 'Error: Bad assumption. Top tag in document is not a section.' + sys.exit(-36) + + # print 'DEBUG: first section -- tag: ' + first_section.tag + ' num children: ' + str(first_section.__len__()) + + for child in first_section.getchildren(): + # print 'DEBUG: child -- tag: ' + child.tag + ' num children: ' + str(child.__len__()) + + # Promote sections + if 'section' in child.tag: + first_section.remove(child); + head.append(child); + # print 'DEBUG: Promoting child -- tag: ' + child.tag + + head.remove(first_section) + + else: + print 'Error: Bad assumption. Too many sections (' + str(head.__len__()) + ') found in base document.' + sys.exit(-13) + + +def transform_head_sections(head): + + num_chapter = 0 + + for child in head.getchildren(): + if 'section' in child.tag: + child.tag = child.tag.replace('section','chapter') + num_chapter = num_chapter+1 + + if num_chapter == 0: + print 'Error: No chapters found in document' + sys.exit(-6) + + +def convert_structure(infile, outfile): + + # Create internal representation of document from infile + parser = etree.XMLParser(remove_comments=False) + tree = etree.parse(infile, parser=parser) + head = tree.getroot() + + # print 'DEBUG: Pre tree structure cleanup...' + # print_tree(head, 0, 3) + + if 'article' in head.tag: + head.tag = 'book' + + # Clear attributes + for attrib in head.attrib.keys(): + head.attrib.pop(attrib, None) + if head.attrib.items() != []: + print 'Error: Section attributes not removed. ', head.attrib.items(), ' items remain -- ', head.attrib.keys() + sys.exit(-5) + else: + print 'Toc file contains ', head.tag, 'tag, not <article>' + sys.exit(-4) + + # Traverse tree sections, removing nodes as needed + traverse_clean_sections(head) + + # Eliminate first section, placeholder for document title + eliminate_top_section(head) + + # Traverse remaining top level <section> and convert to <chapter> + transform_head_sections(head) + + # print 'DEBUG: Post tree structure cleanup...' + # print_tree(head, 0, 2) + + # Persist updates to output file + tree.write(outfile) + + +def remove_book_tags(old_file, new_file): + with open(old_file, 'r') as input: + with open(new_file, 'wb') as output: + for line in input: + if '<book' not in line and '</book>' not in line: + output.write(line) + +def insert_toc_into_book(toc_file, book_file): + book_file_bak = book_file+'.bak' + shutil.copy2(book_file, book_file_bak) + key_string = '<!--TBD-->' + inserted_toc = False + + with open(book_file_bak, 'r') as input: + with open(book_file, 'wb') as output: + for line in input: + if key_string not in line: + output.write(line) + else: + inserted_toc = True + # Write toc_file contents + with open(toc_file, 'r') as input_toc: + for line_toc in input_toc: + output.write(line_toc) + + if not inserted_toc: + print 'Error: key string of "', key_string, '" not found in ', book_file + sys.exit(-7) + +def build_revhistory(book_file): + # Variables for formating git log + log_format = '%h%x01%an%x01%ad%x01%s%x02' + log_fields = ['id', 'author', 'date', 'subject'] + + # Retrieve log + pipe = Popen('git log --date=iso --format="%s" -- . .' % log_format, shell=True, stdout=PIPE) + log, _ = pipe.communicate() + + # Substitute for problem characters: &, <, > + log = log.replace('&','&').replace('<','<').replace('>','>') + + # Remove newlines, trailing end-of-record (0x02), and then split at end-of-record + log = log.replace('\n','').strip('\x02').split('\x02') + + # Split records into individual fields + log = [row.split('\01') for row in log] + + # Create dictionary using field names + log = [dict(zip(log_fields, row)) for row in log] + + # Format log into revision history + revision = '<revhistory>\n' + for entry in log: + revision = revision + '<revision><date>' + entry['date'].split(' ')[0] + '</date><revdescription><para>' +\ + entry['subject'] + ' (' + entry['id'] + ')</para></revdescription></revision>\n' + revision = revision + '</revhistory>\n' + + # Update file + rev_str = '<revhistory>TBD</revhistory>' + update_file(book_file, rev_str, revision) + + +def main(argv): + master_git_url = 'https://github.com/OpenPOWERFoundation/Docs-Master.git' + template_git_url = 'https://github.com/OpenPOWERFoundation/Docs-Template.git' + html_dir = '' + build_dir = '' + db_dir = '' + master_dir = '' + template_dir = '' + toc_file = master_doc+'.xml' + + try: + opts, args = getopt.getopt(argv,"hs:b:d:m:t:",["htmldir","builddir=","docbookdir=","masterdir=","templatedir="]) + except getopt.GetoptError: + print 'Invalid option specified. Usage:' + print ' opf_html2db.py -s <htmldir> -b <builddir> -d <docbookdir> -m <masterdir> -t <templatedir>' + sys.exit(-1) + for opt, arg in opts: + if opt == '-h': + print 'opf_hmtl2db.py -s <htmldir> -b <builddir> -d <docbookdir> -m <masterdir> -t <templatedir>' + sys.exit(0) + elif opt in ("-s", "--htmldir"): + html_dir = arg + elif opt in ("-b", "--builddir"): + build_dir = arg + elif opt in ("-d", "--docbookdir"): + db_dir = arg + elif opt in ("-m", "--masterdir"): + master_dir = arg + elif opt in ("-t", "--templatedir"): + template_dir = arg + + # Verify html directory, error if not found + if not os.path.exists(html_dir): + print 'ERROR: ' + html_dir + ' does not exist. Please specify path to directory containing single html file.' + sys.exit(-11) + + # Generate path to single file + # NOTE: assumption is that file name is always "index.html" (master_doc). If this doesn't prove true, may need to use variable. + html_file_src = os.path.join(html_dir, master_doc + '.html') + + if not os.path.isfile(html_file_src): + print 'ERROR: ' + html_file_src + ' does not exist. Please verify path to single html file and file name.' + sys.exit(-12) + + # Convert html file to xml and place in db directory + if not os.path.exists(db_dir): + print 'Making docbook build directory ' + db_dir + os.path.makedirs(db_dir) + + db_file = os.path.join(db_dir, project + '.xml') + if os.path.exists(db_file): + os.remove(db_file) + + # Clean up herold html output + print 'Cleaning up html file before processing' + html_file = os.path.join(db_dir, master_doc + '.html') + html_file_tmp1 = html_file + '.tmp1' + shutil.copy2(html_file_src, html_file) + cleanup_html(html_file, html_file_tmp1) + + print 'Converting html file to XML...' + print subprocess.check_output(['herold', '-i', html_file_tmp1, '-o', db_file]) + + # Clone a new Master Directory + print 'Cloning new Docs-Master directory...' + if os.path.exists(master_dir): + shutil.rmtree(master_dir) + Repo.clone_from(master_git_url, master_dir) + + # Clone a new Template Directory + print 'Cloning new Docs-Template directory...' + if os.path.exists(template_dir): + shutil.rmtree(template_dir) + Repo.clone_from(template_git_url, template_dir) + + # Create the new XML file ***** + rst_template_dir = os.path.join(template_dir, 'rst_template') + full_toc_file = os.path.join(rst_template_dir, toc_file) + shutil.copy2(db_file, full_toc_file) + book_file = os.path.join(rst_template_dir, 'bk_main.xml') + + # Update all file in opf_docbook_settings with tag/value combinations specified + print 'Updating Docbook files with settings from conf.py...' + for f in opf_docbook_settings.keys(): + filename = os.path.join(rst_template_dir, f) + tags = opf_docbook_settings[f] + + for tag in tags: + value = opf_docbook_settings[f][tag] + + if value != '': + new_str = '<'+tag+'>'+value+'</'+tag+'>' + else: + new_str = '' + + old_str = '<'+tag+'>TBD</'+tag+'>' + update_file(filename, old_str, new_str) + + # Parse TOC file, convert high level tag to "book" and write back out to .tmp1 file + print 'Cleaning up Docbook file structure...' + full_toc_file_tmp1 = full_toc_file+'.tmp1' + full_toc_file_tmp2 = full_toc_file+'.tmp2' + full_toc_file_tmp3 = full_toc_file+'.tmp3' + + # Walk document correcting XML errors + cleanup_xml( full_toc_file, full_toc_file_tmp1 ) + + # Remove extraneous sections + convert_structure( full_toc_file_tmp1, full_toc_file_tmp2 ) + + # Eliminate <book> and <title> tags in .tmp1 and write to .tmp2 file + remove_book_tags(full_toc_file_tmp2, full_toc_file_tmp3) + + # Update link to first file + insert_toc_into_book(full_toc_file_tmp3, book_file) + + # Create revision history from Git Log + print 'Building document revision history from git log...' + build_revhistory(book_file) + + # TODO: Remove this hack after rst_template bk_main gets updated + update_file(book_file, 'xmlns:xlink', 'xmlns:xl') + + # Perform build of Docbook + print 'Building Docbook PDF and HTML output in Maven...' + maven_log_file = 'build.log' + maven_build = 'cd ' + rst_template_dir + '; mvn generate-sources 2>&1 | tee ' + maven_log_file + '' + pipe = Popen(maven_build, shell=True) + log, err = pipe.communicate() + + if pipe.returncode != 0: + print "Build failed with return code:%s" % pipe.returncode + print "See %s/build.log for more details" & rst_template_dir + + # Copy output to better location + print 'Copying build output...' + bld_out_dir = os.path.join(rst_template_dir, 'target/docbkx/webhelp') + html_head = os.path.join(bld_out_dir, opf_docbook_settings['pom.xml']['webhelpDirname'] + '/index.html') + if os.path.exists(bld_out_dir) and os.path.exists(html_head): + doc_dir = os.path.join(build_dir, 'docbook/opf_docbook') + + if os.path.exists(doc_dir): + shutil.rmtree(doc_dir) + shutil.copytree(bld_out_dir, doc_dir) + print "Build successful. Output files located in %s" % os.path.join(doc_dir, opf_docbook_settings['pom.xml']['webhelpDirname']) + + sys.exit(0) + + else: + print "Docbook build failed. Check logfile %s for details." % os.path.join(rst_template_dir, maven_log_file) + sys.exit(-10) + +if __name__ == "__main__": + main(sys.argv[1:])