Second pass at supporting RST->DB conversions:

- Minor update to rst_template/bk_main.xml to use namespace reference
  "xl" instead of "xlink" to be compatible with herold output
- Create new opf_html2db.py script to convert the singlehtml output of a
  project with herold and then massage the result into OPF format for
  inclusion in the rst_template layout (a hypothetical invocation is sketched
  below)
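
  A hypothetical invocation, assuming the Sphinx singlehtml output already
  lives under _build/singlehtml (all paths here are illustrative only):

    opf_html2db.py -s _build/singlehtml -b _build -d _build/docbook \
                   -m Docs-Master -t Docs-Template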

Signed-off-by: Jeff Scheel <scheel@us.ibm.com>

rst_template/bk_main.xml
@@ -27,7 +27,7 @@

<book xmlns="http://docbook.org/ns/docbook"
xmlns:xi="http://www.w3.org/2001/XInclude"
-xmlns:xlink="http://www.w3.org/1999/xlink"
+xmlns:xl="http://www.w3.org/1999/xlink"
version="5.0"
xml:id="bk_main">


opf_html2db.py
@@ -0,0 +1,692 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-
#
# Builds OpenPOWER Foundation documentation using standard template.
#
# Assumes Sphinx singlehtml output has been generated; herold is used below
# to convert that HTML to DocBook.
#
import os, sys, getopt, shutil, errno, subprocess, copy, re
from os import fdopen, remove
from shutil import move
from git import Repo
from lxml import etree
from conf import opf_docbook_settings, master_doc, project
from subprocess import Popen, PIPE
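# Note: conf.py is the project's Sphinx configuration file.  Beyond the
# standard Sphinx values master_doc and project, this script assumes it also
# defines opf_docbook_settings, a dict keyed by template file name whose
# values map tag names to replacement text (see the substitution loop in
# main()), for example (illustrative value):
#
#   opf_docbook_settings = {'pom.xml': {'webhelpDirname': 'my_doc'}, ...}
#
# The 'pom.xml'/'webhelpDirname' entry is also used to locate the build output.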

def copy_xml_to_template(src_dir, tgt_dir):
    # Copy XML files
    src_files = os.listdir(src_dir)
    for filename in src_files:
        full_file = os.path.join(src_dir, filename)
        if (os.path.isfile(full_file)):
            shutil.copy(full_file, tgt_dir)
        elif (os.path.isdir(full_file)):
            try:
                os.makedirs(os.path.join(tgt_dir, filename))
            except OSError as exception:
                if exception.errno != errno.EEXIST:
                    raise
            copy_xml_to_template(os.path.join(src_dir, filename), os.path.join(tgt_dir, filename))

def update_file(filename, old_str, new_str):
    # Verify tag exists
    with open(filename) as f:
        s = f.read()
        if old_str not in s:
            print 'Error: "{old_str}" not found in {filename}.'.format(**locals())
            sys.exit(-2)

    # Safely write the changed content, if found in the file
    with open(filename, 'w') as f:
        s = s.replace(old_str, new_str)
        f.write(s)

def traverse_clean_html_source_examples(filename):
    temp_file = filename + '.tmp'
    code_found = False
    html_source_start_regex = '^<div class="highlight-default"><div class="highlight"><pre>'
    html_source_stop_regex = '^</pre></div>'
    span_regex = '\<span(\sclass="[a-z]+")?>'
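    # The '</span>' replacement and span_regex below strip the syntax-highlight
    # markup that the Sphinx HTML builder emits inside <pre> blocks.  For
    # example, a line like
    #     <span class="kn">import</span> <span class="nn">os</span>
    # is reduced to plain text:
    #     import os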

    print filename
    # Walk file by line
    with open(temp_file, 'w') as new_file:
        with open(filename) as old_file:
            for line in old_file:
                if re.match(html_source_start_regex, line):
                    # print 'DEBUG: Code block start found'
                    code_found = True
                elif re.match(html_source_stop_regex, line):
                    # print 'DEBUG: Code block stop found'
                    code_found = False

                if code_found:
                    oldline = line
                    # Remove </span> references
                    line = line.replace('</span>', '')
                    # Remove <span class=...> references
                    line = re.sub(span_regex, '', line)
                    # print 'DEBUG: line changed.\n Old: >' + oldline + '<\n New: >' + line + '<'
                new_file.write(line)

    # Preserve old file
    move(filename, filename + '.bak')
    # Move new file into old
    move(temp_file, filename)

def traverse_clean_html_nodes(element):

    if 'ul' in element.tag and element.attrib:
        key = element.attrib.keys()[0]
        value = element.attrib[key]
        if 'id' in key:
            first_child = element.__getitem__(0)
            if first_child.__len__() == 0:
                print 'Error: Bad assumption. <ul> tag is empty.'
            # Add attribute to first_child and remove from element
            first_child.attrib[key] = value
            del element.attrib[key]
            # print 'DEBUG: <ul> attributes: ', element.attrib
            # print 'DEBUG: child attributes: ', first_child.attrib
            sys.stderr.write('**Information: id attribute on <ul> tag moved to first sub-element, <' + element.tag + '> for ' + key + ' = ' + value + '\n')
    for child in element.getchildren():
        traverse_clean_html_nodes(child)

def cleanup_html(infile, outfile):
    # Create internal representation of document from infile
    parser = etree.XMLParser(remove_comments=False)
    tree = etree.parse(infile, parser=parser)
    head = tree.getroot()

    # print_tree( head, 0, 2 )

    # Walk nodes doing any cleanup
    traverse_clean_html_nodes(head)

    # Persist updates to output file
    tree.write(outfile)
    # Note: This invocation needs to occur post tree-write because
    # it will update file
    traverse_clean_html_source_examples(outfile)

def find_match(reference, anchor_node, relationship):

    if not anchor_node is None and 'anchor' in anchor_node.tag:
        # Try this, verify matching ids
        key = anchor_node.attrib.keys()[0]
        value = anchor_node.attrib[key]
        regex = '^' + reference + '(\.\d+)?$'

        # print 'DEBUG: ' + relationship + ' anchor check. Reference: ' + reference + ' Regex: ' + regex + ' Value: ' + value

        if re.match(regex, value):
            return anchor_node

        else:
            # print 'DEBUG: Anchor in ' + relationship + ' tag does not match. Expected: ', reference, ' Found: ', value, ' Looking further...'
            node = anchor_node
            while not node.getprevious() is None:
                node = node.getprevious()
                if 'anchor' in node.tag:
                    key = node.attrib.keys()[0]
                    value = node.attrib[key]
                    if re.match(regex, value):
                        # print 'DEBUG: Anchor in ' + relationship + ' tag finally match!!!'
                        return node
                    # else
                    #     print 'DEBUG: Anchor in ' + relationship + ' tag does not match. Expected: ', reference, ' Found: ', value, ' Looking further...'
                else:
                    # print 'DEBUG: Anchor in ' + relationship + ' tag does not match. Expected: ', reference, ' Found: ', value, ' Anchor node: ', node
                    return None

    else:
        # print 'Error: find_match called with non-anchor element. Reference: ' + reference + ' Node: ' + anchor_node + ' Relationship: ' + relationship
        return None
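
# The link cleanup below assumes herold output of roughly this shape
# (illustrative only, reconstructed from the checks in traverse_clean_links
# and find_match; the attribute name and values are assumptions):
#
#   <anchor xml:id="some-ref"/>            duplicate ids may become "some-ref.1"
#   <section>
#     <title>Some Title
#       <link linkend="some-ref">&#182;</link>
#     </title>
#     ...
#
# The pilcrow-only <link> is always deleted; when a matching <anchor> is found,
# its id is promoted onto the enclosing <section> (or onto the link's parent
# when that parent is not a <title>), and the <anchor> is removed.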

def traverse_clean_links(element):

    if 'link' in element.tag:
        # Note: Terminal tag, no need to recurse
        # Gather link details
        text = element.text
        num_attributes = element.attrib.__len__()
        reference = element.attrib.get('linkend', None)
        if num_attributes == 1 and not reference is None and text == u'¶':
            # Erroneous link, find related anchor, could be "uncle" or "cousin" (of various degrees)
            anchor = None
            parent = element.getparent()
            grandparent = parent.getparent()
            greatuncle = grandparent.getprevious()
            # Check Great Uncle for match
            anchor = find_match(reference, greatuncle, 'Great Uncle')
            # If no match, locate "cousin" and if found, check it
            if anchor is None:
                cousin = None
                if not greatuncle is None:
                    node = greatuncle
                    while node.__len__() > 0 and cousin is None:
                        node = node.__getitem__(node.__len__() - 1)
                        if 'anchor' in node.tag:
                            cousin = node
                if not cousin is None:
                    anchor = find_match(reference, cousin, 'Cousin')
            # If no match, try uncle
            if anchor is None:
                uncle = parent.getprevious()
                anchor = find_match(reference, uncle, 'Uncle')
            # Always delete <link> tag of this type (contains only u'¶' for text)
            parent.__delitem__(parent.index(element))
            if not anchor is None:
                # print 'MATCH FOUND: ', reference

                # Retrieve attribute key from anchor
                # Note: The <link> key is always correctly set by herold in the case of duplicate keys.
                #       The <anchor> tag may have a "dot" and a number appended to the value in <link>.
                key = anchor.attrib.keys()[0]
                value = anchor.get(key)
                if 'title' in parent.tag:
                    # Add id attribute to Grandparent
                    grandparent.set(key, value)
                else:
                    # Add id attribute to Parent
                    parent.set(key, value)
                sys.stderr.write('**Information: removed dummy link for ' + reference + ' and added proper xml:id as ' + value + '\n')
                # Delete <anchor> tag
                anchor_parent = anchor.getparent()
                anchor_parent.__delitem__(anchor_parent.index(anchor))
            else:
                # Nothing more to do
                sys.stderr.write('**Information: Matching <anchor> element not found for reference = ' + reference + '. Link removed.' + '\n')
    else:
        for child in element.getchildren():
            traverse_clean_links(child)

def traverse_clean_other(element):
    if 'informalexample' in element.tag:
        # Get key elements around this one
        parent = element.getparent()
        grandparent = parent.getparent()

        # Create new elements -- section and title (use text from informal example element)
        new_section = parent.makeelement(grandparent.tag)
        new_title = parent.makeelement('title')
        title = element.text
        new_title.text = title

        # Add title to new section
        new_section.append(new_title)

        # Copy over children from <informalexample> to new <section>
        for child in element.getchildren():
            element.remove(child)
            new_section.append(child)

        # print 'DEBUG: old tree...'
        # print_tree(parent, 0, 2)

        # Add new <section> as next sibling of parent and remove <informalexample> from parent
        parent.addnext(new_section)
        parent.remove(element)

        # print 'DEBUG: new tree...'
        # print_tree(parent.getparent(), 0, 3)
        sys.stderr.write('**Information: <informalexample> ' + element.text + ' removed and promoted as <section> with title: ' + title + '\n')

    elif 'note' in element.tag:
        # Get key elements around this one
        parent = element.getparent()
        grandparent = parent.getparent()

        # print 'DEBUG: old tree...'
        # print_tree(parent, 0, 4)

        # Create new elements -- section and title (use text from bridgehead subelement)
        new_section = parent.makeelement(parent.tag)
        bridgehead = element.__getitem__(0).__getitem__(0)
        if not 'bridgehead' in bridgehead.tag:
            print 'Error: Bad assumption about <note> structure. Bridgehead not found as expected.'
            sys.exit(-20)
        title = bridgehead.text
        new_title = parent.makeelement('title')
        new_title.text = title

        # Add title to new section
        new_section.append(new_title)
        # Remove <bridgehead> from <note>
        bridgehead.getparent().remove(bridgehead)
        # Copy over remaining items in <note> to new <section>
        for child in element.getchildren():
            element.remove(child)
            new_section.append(child)
        # Add new <section> as next sibling of parent and remove <note> from parent
        parent.addnext(new_section)
        parent.remove(element)

        # print 'DEBUG: New tree...'
        # print_tree(grandparent, 0, 3)
        sys.stderr.write('**Information: <note> removed and promoted as <section> with title: ' + title + '\n')

    elif 'anchor' in element.tag:
        # Get key elements around this one
        parent = element.getparent()

        # Retrieve anchor details
        key = element.attrib.keys()[0]
        value = element.attrib[key]

        # Remove node
        parent.remove(element)

        sys.stderr.write('**Information: removed <anchor> with id: ' + value + '\n')
    elif 'section' in element.tag:
        # Ensure at least one child beyond <title>
        if element.__len__() == 1:
            title = element.__getitem__(0).text
            parent = element.getparent()

            # Make and add empty paragraph to section, just behind title
            new_para = parent.makeelement('para')
            new_para.text = '&nbsp;'
            element.append(new_para)
            sys.stderr.write('**Information: <para> tag added to empty section with title: ' + title + '\n')

    for child in element.getchildren():
        traverse_clean_other(child)

def cleanup_xml(infile, outfile):
    # Create internal representation of document from infile
    parser = etree.XMLParser(remove_comments=False)
    tree = etree.parse(infile, parser=parser)
    head = tree.getroot()

    # print_tree( head, 0, 2 )

    # Note: because link cleanup involves relative location of multiple tags, it must be separate and first
    traverse_clean_links(head)
    traverse_clean_other(head)

    # Persist updates to output file
    tree.write(outfile)

def print_tree(element, level, max_depth):
    # Print current element
    num_children = element.__len__()
    indent = ' '.ljust(level+1)
    if level < max_depth:
        print indent, 'Tag: ', element.tag, ' Attrib: ', element.attrib, ' Text: >', element.text, '< Num children: ', num_children
        for i in range(num_children):
            child = element.__getitem__(i)
            print_tree(child, level+1, max_depth)

def traverse_clean_sections(element):
    section_blacklist = ['Navigation', 'Table Of Contents']

    # Walk children looking for next set of <section> tags, opening include files if necessary
    num_children = element.__len__()
    i = 0
    while i < num_children:
        child = element.__getitem__(i)
        parent = element
        # print 'DEBUG: clean sections, visiting node with tag: ' + child.tag
        # Walk first level of tags, deleting info and any "blacklist" sections
        if 'section' in child.tag:
            num_sec_children = child.__len__()
            title = ''
            if num_sec_children > 0:
                first_grandchild = child.__getitem__(0)
                if first_grandchild.__len__() == 0:
                    title = child.__getitem__(0).text
                else:
                    # This makes me nervous, not sure how well it will work...
                    title = first_grandchild.__getitem__(0).text
                # print 'Section title found: ' + title
            if title in section_blacklist:
                # Delete section
                # print 'DEBUG: Deleted blacklist section ' + title
                parent.remove(child)
                num_children = num_children - 1
            else:
                traverse_clean_sections(child)
                i = i + 1
        else:
            i = i + 1

def eliminate_top_section(head):

    # Remove <info> and <index> sections
    for child in head.getchildren():
        if 'info' in child.tag or 'index' in child.tag:
            # print 'DEBUG: unneeded top level tag: ' + child.tag
            head.remove(child)
    # Eliminate head section which really is the title
    if head.__len__() == 1:
        first_section = head.__getitem__(0)
        if not 'section' in first_section.tag:
            print 'Error: Bad assumption. Top tag in document is not a section.'
            sys.exit(-36)
        # print 'DEBUG: first section -- tag: ' + first_section.tag + ' num children: ' + str(first_section.__len__())
        for child in first_section.getchildren():
            # print 'DEBUG: child -- tag: ' + child.tag + ' num children: ' + str(child.__len__())
            # Promote sections
            if 'section' in child.tag:
                first_section.remove(child)
                head.append(child)
                # print 'DEBUG: Promoting child -- tag: ' + child.tag
        head.remove(first_section)

    else:
        print 'Error: Bad assumption. Too many sections (' + str(head.__len__()) + ') found in base document.'
        sys.exit(-13)


def transform_head_sections(head):

    num_chapter = 0
    for child in head.getchildren():
        if 'section' in child.tag:
            child.tag = child.tag.replace('section', 'chapter')
            num_chapter = num_chapter + 1

    if num_chapter == 0:
        print 'Error: No chapters found in document'
        sys.exit(-6)


def convert_structure(infile, outfile):

    # Create internal representation of document from infile
    parser = etree.XMLParser(remove_comments=False)
    tree = etree.parse(infile, parser=parser)
    head = tree.getroot()

    # print 'DEBUG: Pre tree structure cleanup...'
    # print_tree(head, 0, 3)

    if 'article' in head.tag:
        head.tag = 'book'
        # Clear attributes
        for attrib in head.attrib.keys():
            head.attrib.pop(attrib, None)
        if head.attrib.items() != []:
            print 'Error: Section attributes not removed. ', head.attrib.items(), ' items remain -- ', head.attrib.keys()
            sys.exit(-5)
    else:
        print 'Toc file contains ', head.tag, 'tag, not <article>'
        sys.exit(-4)

    # Traverse tree sections, removing nodes as needed
    traverse_clean_sections(head)

    # Eliminate first section, placeholder for document title
    eliminate_top_section(head)
    # Traverse remaining top level <section> and convert to <chapter>
    transform_head_sections(head)

    # print 'DEBUG: Post tree structure cleanup...'
    # print_tree(head, 0, 2)
    # Persist updates to output file
    tree.write(outfile)


def remove_book_tags(old_file, new_file):
    with open(old_file, 'r') as input:
        with open(new_file, 'wb') as output:
            for line in input:
                if '<book' not in line and '</book>' not in line:
                    output.write(line)

def insert_toc_into_book(toc_file, book_file):
    book_file_bak = book_file + '.bak'
    shutil.copy2(book_file, book_file_bak)
    key_string = '<!--TBD-->'
    inserted_toc = False

    with open(book_file_bak, 'r') as input:
        with open(book_file, 'wb') as output:
            for line in input:
                if key_string not in line:
                    output.write(line)
                else:
                    inserted_toc = True
                    # Write toc_file contents
                    with open(toc_file, 'r') as input_toc:
                        for line_toc in input_toc:
                            output.write(line_toc)
    if not inserted_toc:
        print 'Error: key string of "', key_string, '" not found in ', book_file
        sys.exit(-7)

def build_revhistory(book_file):
    # Variables for formatting git log
    log_format = '%h%x01%an%x01%ad%x01%s%x02'
    log_fields = ['id', 'author', 'date', 'subject']
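    # With this format each commit is emitted as
    #   <abbrev-hash>\x01<author>\x01<iso-date>\x01<subject>\x02
    # so the output is split into records on \x02 and into fields on \x01,
    # then zipped with log_fields into one dict per commit.  Each commit
    # becomes a <revision> whose <date> is the date portion of the ISO
    # timestamp and whose <revdescription> is "<subject> (<hash>)".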

    # Retrieve log
    pipe = Popen('git log --date=iso --format="%s" -- . .' % log_format, shell=True, stdout=PIPE)
    log, _ = pipe.communicate()
    # Substitute for problem characters: &, <, >
    log = log.replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;')
    # Remove newlines, trailing end-of-record (0x02), and then split at end-of-record
    log = log.replace('\n', '').strip('\x02').split('\x02')
    # Split records into individual fields
    log = [row.split('\x01') for row in log]
    # Create dictionary using field names
    log = [dict(zip(log_fields, row)) for row in log]

    # Format log into revision history
    revision = '<revhistory>\n'
    for entry in log:
        revision = revision + '<revision><date>' + entry['date'].split(' ')[0] + '</date><revdescription><para>' +\
                   entry['subject'] + ' (' + entry['id'] + ')</para></revdescription></revision>\n'
    revision = revision + '</revhistory>\n'

    # Update file
    rev_str = '<revhistory>TBD</revhistory>'
    update_file(book_file, rev_str, revision)

def main(argv):
    master_git_url = 'https://github.com/OpenPOWERFoundation/Docs-Master.git'
    template_git_url = 'https://github.com/OpenPOWERFoundation/Docs-Template.git'
    html_dir = ''
    build_dir = ''
    db_dir = ''
    master_dir = ''
    template_dir = ''
    toc_file = master_doc + '.xml'

    try:
        opts, args = getopt.getopt(argv, "hs:b:d:m:t:", ["htmldir=", "builddir=", "docbookdir=", "masterdir=", "templatedir="])
    except getopt.GetoptError:
        print 'Invalid option specified. Usage:'
        print '  opf_html2db.py -s <htmldir> -b <builddir> -d <docbookdir> -m <masterdir> -t <templatedir>'
        sys.exit(-1)
    for opt, arg in opts:
        if opt == '-h':
            print 'opf_html2db.py -s <htmldir> -b <builddir> -d <docbookdir> -m <masterdir> -t <templatedir>'
            sys.exit(0)
        elif opt in ("-s", "--htmldir"):
            html_dir = arg
        elif opt in ("-b", "--builddir"):
            build_dir = arg
        elif opt in ("-d", "--docbookdir"):
            db_dir = arg
        elif opt in ("-m", "--masterdir"):
            master_dir = arg
        elif opt in ("-t", "--templatedir"):
            template_dir = arg

    # Verify html directory, error if not found
    if not os.path.exists(html_dir):
        print 'ERROR: ' + html_dir + ' does not exist. Please specify path to directory containing single html file.'
        sys.exit(-11)

    # Generate path to single file
    # NOTE: assumption is that file name is always "index.html" (master_doc). If this doesn't prove true, may need to use variable.
    html_file_src = os.path.join(html_dir, master_doc + '.html')

    if not os.path.isfile(html_file_src):
        print 'ERROR: ' + html_file_src + ' does not exist. Please verify path to single html file and file name.'
        sys.exit(-12)

    # Convert html file to xml and place in db directory
    if not os.path.exists(db_dir):
        print 'Making docbook build directory ' + db_dir
        os.makedirs(db_dir)

    db_file = os.path.join(db_dir, project + '.xml')
    if os.path.exists(db_file):
        os.remove(db_file)

    # Clean up herold html output
    print 'Cleaning up html file before processing'
    html_file = os.path.join(db_dir, master_doc + '.html')
    html_file_tmp1 = html_file + '.tmp1'
    shutil.copy2(html_file_src, html_file)
    cleanup_html(html_file, html_file_tmp1)

    print 'Converting html file to XML...'
    print subprocess.check_output(['herold', '-i', html_file_tmp1, '-o', db_file])
    # Clone a new Master Directory
    print 'Cloning new Docs-Master directory...'
    if os.path.exists(master_dir):
        shutil.rmtree(master_dir)
    Repo.clone_from(master_git_url, master_dir)
    # Clone a new Template Directory
    print 'Cloning new Docs-Template directory...'
    if os.path.exists(template_dir):
        shutil.rmtree(template_dir)
    Repo.clone_from(template_git_url, template_dir)
    # Create the new XML file *****
    rst_template_dir = os.path.join(template_dir, 'rst_template')
    full_toc_file = os.path.join(rst_template_dir, toc_file)
    shutil.copy2(db_file, full_toc_file)
    book_file = os.path.join(rst_template_dir, 'bk_main.xml')
    # Update all files in opf_docbook_settings with the tag/value combinations specified
    print 'Updating Docbook files with settings from conf.py...'
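    # Each entry rewrites a '<tag>TBD</tag>' placeholder in the named template
    # file.  For example (hypothetical tag and value), a conf.py entry of
    # {'bk_main.xml': {'pubdate': '2018-06-01'}} turns '<pubdate>TBD</pubdate>'
    # into '<pubdate>2018-06-01</pubdate>'; an empty value simply deletes the
    # placeholder string.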
    for f in opf_docbook_settings.keys():
        filename = os.path.join(rst_template_dir, f)
        tags = opf_docbook_settings[f]

        for tag in tags:
            value = opf_docbook_settings[f][tag]
            if value != '':
                new_str = '<' + tag + '>' + value + '</' + tag + '>'
            else:
                new_str = ''

            old_str = '<' + tag + '>TBD</' + tag + '>'
            update_file(filename, old_str, new_str)
    # Parse TOC file, convert high level tag to "book" and write back out to .tmp files
    print 'Cleaning up Docbook file structure...'
    full_toc_file_tmp1 = full_toc_file + '.tmp1'
    full_toc_file_tmp2 = full_toc_file + '.tmp2'
    full_toc_file_tmp3 = full_toc_file + '.tmp3'

    # Walk document correcting XML errors
    cleanup_xml(full_toc_file, full_toc_file_tmp1)
    # Remove extraneous sections
    convert_structure(full_toc_file_tmp1, full_toc_file_tmp2)
    # Eliminate <book> and <title> tags in .tmp2 and write to .tmp3 file
    remove_book_tags(full_toc_file_tmp2, full_toc_file_tmp3)

    # Insert the converted TOC content into the book file
    insert_toc_into_book(full_toc_file_tmp3, book_file)
    # Create revision history from Git Log
    print 'Building document revision history from git log...'
    build_revhistory(book_file)

    # TODO: Remove this hack after rst_template bk_main gets updated
    update_file(book_file, 'xmlns:xlink', 'xmlns:xl')
    # Perform build of Docbook
    print 'Building Docbook PDF and HTML output in Maven...'
    maven_log_file = 'build.log'
    maven_build = 'cd ' + rst_template_dir + '; mvn generate-sources 2>&1 | tee ' + maven_log_file
    pipe = Popen(maven_build, shell=True)
    log, err = pipe.communicate()
    if pipe.returncode != 0:
        print "Build failed with return code: %s" % pipe.returncode
        print "See %s/build.log for more details" % rst_template_dir
    # Copy output to better location
    print 'Copying build output...'
    bld_out_dir = os.path.join(rst_template_dir, 'target/docbkx/webhelp')
    html_head = os.path.join(bld_out_dir, opf_docbook_settings['pom.xml']['webhelpDirname'] + '/index.html')
    if os.path.exists(bld_out_dir) and os.path.exists(html_head):
        doc_dir = os.path.join(build_dir, 'docbook/opf_docbook')
        if os.path.exists(doc_dir):
            shutil.rmtree(doc_dir)
        shutil.copytree(bld_out_dir, doc_dir)
        print "Build successful. Output files located in %s" % os.path.join(doc_dir, opf_docbook_settings['pom.xml']['webhelpDirname'])
        sys.exit(0)

    else:
        print "Docbook build failed. Check logfile %s for details." % os.path.join(rst_template_dir, maven_log_file)
        sys.exit(-10)

if __name__ == "__main__":
    main(sys.argv[1:])