| /************************************************************** |
| * |
| * Licensed to the Apache Software Foundation (ASF) under one |
| * or more contributor license agreements. See the NOTICE file |
| * distributed with this work for additional information |
| * regarding copyright ownership. The ASF licenses this file |
| * to you under the Apache License, Version 2.0 (the |
| * "License"); you may not use this file except in compliance |
| * with the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, |
| * software distributed under the License is distributed on an |
| * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| * KIND, either express or implied. See the License for the |
| * specific language governing permissions and limitations |
| * under the License. |
| * |
| *************************************************************/ |
| |
| |
| |
| package org.openoffice.xmerge.converter.xml.sxw.wordsmith; |
| |
| import org.w3c.dom.*; |
| |
| import java.io.IOException; |
| import java.util.Enumeration; |
| |
| import org.openoffice.xmerge.Document; |
| import org.openoffice.xmerge.ConvertData; |
| import org.openoffice.xmerge.ConvertException; |
| import org.openoffice.xmerge.DocumentDeserializer; |
| import org.openoffice.xmerge.converter.xml.OfficeConstants; |
| import org.openoffice.xmerge.converter.palm.PalmDB; |
| import org.openoffice.xmerge.converter.palm.Record; |
| import org.openoffice.xmerge.converter.palm.PdbDecoder; |
| import org.openoffice.xmerge.converter.palm.PalmDocument; |
| import org.openoffice.xmerge.converter.xml.sxw.SxwDocument; |
| |
| import java.util.Vector; |
| import java.io.ByteArrayInputStream; |
| |
| import org.openoffice.xmerge.converter.xml.*; |
| import org.openoffice.xmerge.util.Debug; |
| import org.openoffice.xmerge.util.XmlUtil; |
| |
| /** |
| * <p>WordSmith implementation of |
| * org.openoffice.xmerge.DocumentDeserializer |
| * for the {@link |
| * org.openoffice.xmerge.converter.xml.sxw.wordsmith.PluginFactoryImpl |
| * PluginFactoryImpl}.</p> |
| * |
| * The <code>deserialize</code> method uses a |
| * <code>DocDecoder</code> to read the WordSmith format into a |
| * <code>String</code> object, then it calls <code>buildDocument</code> |
| * to create a <code>SxwDocument</code> object from it. |
| * |
| * @author Herbie Ong, David Proulx |
| */ |
| public final class DocumentDeserializerImpl |
| implements DOCConstants, OfficeConstants, DocumentDeserializer { |
| |
| /** A Decoder object for decoding WordSmith format. */ |
| private WSDecoder decoder = null; |
| |
| WseFontTable fontTable = null; |
| WseColorTable colorTable = null; |
| StyleCatalog styleCat = null; |
| StyleCatalog oldStyleCat = null; |
| |
| /** A <code>ConvertData</code> object assigned to this object. */ |
| private ConvertData cd = null; |
| |
| |
| /** |
| * Constructor that assigns the given <code>ConvertData</code> |
| * to the object. |
| * |
| * @param cd A <code>ConvertData</code> object to read data for |
| * the conversion process by the deserialize method. |
| */ |
| public DocumentDeserializerImpl(ConvertData cd) { |
| this.cd = cd; |
| } |
| |
| |
| /** |
| * Convert the given <code>ConvertData</code> into a |
| * <code>SxwDocument</code> object. |
| * |
| * @return Resulting <code>Document</code> object. |
| * |
| * @throws ConvertException If any conversion error occurs. |
| * @throws IOException If any I/O error occurs. |
| */ |
| public Document deserialize() throws ConvertException, |
| IOException { |
| return deserialize(null, cd); |
| } |
| |
| |
| public Document deserialize(Document origDoc, ConvertData cd) |
| throws IOException { |
| |
| Document doc = null; |
| PalmDocument palmDoc = null; |
| Enumeration e = cd.getDocumentEnumeration(); |
| |
| while(e.hasMoreElements()) { |
| palmDoc = (PalmDocument) e.nextElement(); |
| PalmDB pdb = palmDoc.getPdb(); |
| Record[] recs = pdb.getRecords(); |
| decoder = new WSDecoder(); |
| Wse[] b = decoder.parseDocument(recs); |
| String docName = palmDoc.getName(); |
| doc = buildDocument(docName, b, origDoc); |
| } |
| return doc; |
| } |
| |
| |
| /** |
| * Temporary method to read existing <code>StyleCatalog</code> |
| * as a starting point. |
| * |
| * @param parentDoc The parent <code>Document</code>. |
| */ |
| private void readStyleCatalog(Document parentDoc) { |
| Element rootNode = null; |
| try { |
| java.io.ByteArrayOutputStream bos = new java.io.ByteArrayOutputStream(); |
| parentDoc.write(bos); |
| SxwDocument sxwDoc = new SxwDocument("old"); |
| sxwDoc.read(new ByteArrayInputStream(bos.toByteArray())); |
| org.w3c.dom.Document domDoc = sxwDoc.getContentDOM(); |
| |
| String families[] = new String[3]; |
| families[0] = "text"; |
| families[1] = "paragraph"; |
| families[2] = "paragraph"; |
| Class classes[] = new Class[3]; |
| classes[0] = TextStyle.class; |
| classes[1] = ParaStyle.class; |
| classes[2] = TextStyle.class; |
| |
| NodeList nl = domDoc.getElementsByTagName(TAG_OFFICE_STYLES); |
| oldStyleCat.add(nl.item(0), families, classes, null, false); |
| nl = domDoc.getElementsByTagName(TAG_OFFICE_AUTOMATIC_STYLES); |
| oldStyleCat.add(nl.item(0), families, classes, null, false); |
| nl = domDoc.getElementsByTagName(TAG_OFFICE_MASTER_STYLES); |
| oldStyleCat.add(nl.item(0), families, classes, null, false); |
| |
| } catch (Exception e) { |
| Debug.log(Debug.ERROR, "", e); |
| } |
| |
| } |
| |
| |
| /** |
| * Given an array of paragraph <code>Style</code> objects, see if |
| * there is exactly one which matches the text formatting |
| * <code>Style</code> of <code>tStyle</code>. |
| * |
| * @param paraStyles An array of paragraph <code>Style</code> |
| * objects. |
| * @param tStyle Text <code>Style</code> to match. |
| * |
| * @return The paragraph <code>Style</code> that matches. |
| */ |
| private ParaStyle matchParaByText(Style paraStyles[], TextStyle tStyle) { |
| int matchIndex = -1; |
| int matchCount = 0; |
| Style txtMatches[] = (Style[]) oldStyleCat.getMatching(tStyle); |
| if (txtMatches.length >= 1) { |
| for (int j = 0; j < txtMatches.length; j++) { |
| TextStyle t = (TextStyle)txtMatches[j]; |
| |
| if (!t.getFamily().equals("paragraph")) |
| continue; |
| |
| for (int k = 0; k < paraStyles.length; k++) { |
| if (t.getName().equals(paraStyles[k].getName())) { |
| matchCount++; |
| matchIndex = k; |
| } |
| } |
| } |
| } |
| if (matchCount == 1) |
| return (ParaStyle)paraStyles[matchIndex]; |
| else return null; |
| } |
| |
| |
| /** |
| * Take a <code>String</code> of text and turn it into a sequence |
| * of <code>Node</code> objects. |
| * |
| * @param text <code>String</code> of text. |
| * @param parentDoc Parent <code>Document</code>. |
| * |
| * @return Array of <code>Node</code> objects. |
| */ |
| private Node[] parseText(String text, org.w3c.dom.Document parentDoc) { |
| Vector nodeVec = new Vector(); |
| |
| // Break up the text from the WordSmith text run into Open |
| // Office text runs. There may be more runs in OO because |
| // runs of 2 or more spaces map to nodes. |
| while ((text.indexOf(" ") != -1) || (text.indexOf("\t") != 1)) { |
| |
| // Find the indices of tabs and multiple spaces, and |
| // figure out which of them occurs first in the string. |
| int spaceIndex = text.indexOf(" "); |
| int tabIndex = text.indexOf("\t"); |
| if ((spaceIndex == -1) && (tabIndex == -1)) |
| break; // DJP This should not be necessary. What is wrong |
| // with the while() stmt up above? |
| int closerIndex; // Index of the first of these |
| if (spaceIndex == -1) |
| closerIndex = tabIndex; |
| else if (tabIndex == -1) |
| closerIndex = spaceIndex; |
| else |
| closerIndex = (spaceIndex > tabIndex) ? tabIndex : spaceIndex; |
| |
| // If there is any text prior to the first occurrence of a |
| // tab or spaces, create a text node from it, then chop it |
| // off the string we're working with. |
| if (closerIndex > 0) { |
| String beginningText = text.substring(0, closerIndex); |
| Text textNode = parentDoc.createTextNode(beginningText); |
| nodeVec.addElement(textNode); |
| log("<TEXT>"); |
| log(beginningText); |
| log("</TEXT>"); |
| } |
| text = text.substring(closerIndex); |
| |
| // Handle either tab character or space sequence by creating |
| // an element for it, and then chopping out the text that |
| // represented it in "text". |
| if (closerIndex == tabIndex) { |
| Element tabNode = parentDoc.createElement(TAG_TAB_STOP); |
| nodeVec.add(tabNode); |
| text = text.substring(1); // tab is always a single character |
| log("<TAB/>"); |
| } else { |
| // Compute length of space sequence. |
| int nrSpaces = 2; |
| while ((nrSpaces < text.length()) |
| && text.substring(nrSpaces, nrSpaces + 1).equals(" ")) |
| nrSpaces++; |
| |
| Element spaceNode = parentDoc.createElement(TAG_SPACE); |
| spaceNode.setAttribute(ATTRIBUTE_SPACE_COUNT, new Integer(nrSpaces).toString()); |
| nodeVec.add(spaceNode); |
| text = text.substring(nrSpaces); |
| log("<SPACE count=\"" + nrSpaces + "\" />"); |
| } |
| } |
| |
| // No more tabs or space sequences. If there's any remaining |
| // text create a text node for it. |
| if (text.length() > 0) { |
| Text textNode = parentDoc.createTextNode(text); |
| nodeVec.add(textNode); |
| log("<TEXT>"); |
| log(text); |
| log("</TEXT>"); |
| } |
| |
| // Now create and populate an array to return the nodes in. |
| Node nodes[] = new Node[nodeVec.size()]; |
| for (int i = 0; i < nodeVec.size(); i++) |
| nodes[i] = (Node)nodeVec.elementAt(i); |
| return nodes; |
| } |
| |
| |
| /** |
| * Parses the text content of a WordSmith format and builds a |
| * <code>SXWDocument</code>. |
| * |
| * @param docName <code>Document</code> name |
| * @param data Text content of WordSmith format |
| * |
| * @return Resulting <code>SXWDocument</code> object. |
| * |
| * @throws IOException If any I/O error occurs. |
| */ |
| private SxwDocument buildDocument(String docName, Wse[] data, Document origDoc) |
| throws IOException { |
| |
| // create minimum office xml document. |
| SxwDocument sxwDoc = new SxwDocument(docName); |
| sxwDoc.initContentDOM(); |
| |
| org.w3c.dom.Document doc = sxwDoc.getContentDOM(); |
| |
| // Grab hold of the office:body tag, |
| // Assume there should be one. |
| // This is where top level paragraphs will append to. |
| NodeList list = doc.getElementsByTagName(TAG_OFFICE_BODY); |
| Node bodyNode = list.item(0); |
| |
| styleCat = new StyleCatalog(50); |
| oldStyleCat = new StyleCatalog(50); |
| if (origDoc != null) |
| readStyleCatalog(origDoc); |
| |
| Element currPara = null; |
| ParaStyle currParaStyle = null; |
| int newTextStyleNr = 0; |
| int newParaStyleNr = 0; |
| |
| // Now write out the document body by running through |
| // the list of WordSmith elements and processing each one |
| // in turn. |
| for (int i = 0; i < data.length; i++) { |
| |
| if (data[i].getClass() == WsePara.class) { |
| |
| currPara = doc.createElement(TAG_PARAGRAPH); |
| log("</PARA>"); |
| log("<PARA>"); |
| |
| WsePara p = (WsePara)data[i]; |
| |
| // Save info about the first text run, if there is one. |
| WseTextRun firstTextRun = null; |
| |
| if ((data.length >= i + 2) |
| && (data[i+1].getClass() == WseTextRun.class)) |
| firstTextRun = (WseTextRun)data[i+1]; |
| |
| Style matches[] = oldStyleCat.getMatching(p.makeStyle()); |
| |
| // See if we can find a unique match in the catalog |
| // of existing styles from the original document. |
| ParaStyle pStyle = null; |
| if (matches.length == 1) { |
| pStyle = (ParaStyle)matches[0]; |
| log("using an existing style"); |
| } else if ((matches.length > 1) && (firstTextRun != null)) { |
| pStyle = matchParaByText(matches, firstTextRun.makeStyle()); |
| log("resolved a para by looking @ text"); |
| } |
| |
| // If nothing found so far, try looking in the catalog |
| // of newly-created styles. |
| // DJP FIXME: if we need to add two para styles with the |
| // same para formatting info but different default text |
| // styles, this won't work! |
| if (pStyle == null) { |
| log("had " + matches.length + " matches in old catalog"); |
| matches = styleCat.getMatching(p.makeStyle()); |
| if (matches.length == 0) { |
| pStyle = p.makeStyle(); |
| String newName = new String("PPP" + ++newParaStyleNr); |
| pStyle.setName(newName); |
| styleCat.add(pStyle); |
| // DJP: write in the text format info here |
| log("created a new style"); |
| } else if (matches.length == 1) { |
| pStyle = (ParaStyle)matches[0]; |
| log("re-using a new style"); |
| } else if (firstTextRun != null) { |
| pStyle = matchParaByText(matches, firstTextRun.makeStyle()); |
| if (pStyle != null) { |
| log("resolved a (new) para by looking @ text"); |
| } else |
| log("Hey this shouldn't happen! - nr of matches is " |
| + matches.length); |
| } |
| } |
| |
| if (pStyle == null) |
| log("Unable to figure out a para style"); |
| |
| // Figured out a style to use. Specify the style in this |
| // paragraph's attributes. |
| currPara.setAttribute(ATTRIBUTE_TEXT_STYLE_NAME, pStyle.getName()); |
| |
| bodyNode.appendChild(currPara); |
| currParaStyle = pStyle; |
| } else if (data[i].getClass() == WseTextRun.class) { |
| WseTextRun tr = (WseTextRun)data[i]; |
| TextStyle trStyle = null; |
| Node trNodes[] = parseText(tr.getText(), doc); |
| |
| // First see if the formatting of this text run matches |
| // the default text formatting for this paragraph. If |
| // it does, then just make the text node(s) children of |
| // the current paragraph. |
| Style[] cps = new Style[1]; |
| cps[0] = currParaStyle; |
| if (matchParaByText(cps, tr.makeStyle()) != null) { |
| for (int ii = 0; ii < trNodes.length; ii++) { |
| currPara.appendChild(trNodes[ii]); |
| } |
| continue; |
| } |
| |
| // Check for existing, matching styles in the old style |
| // catalog. If exactly one is found, use it. Otherwise, |
| // check the new style catalog, and either use the style |
| // found or add this new one to it. |
| Style matches[] = oldStyleCat.getMatching(tr.makeStyle()); |
| if (matches.length == 1) |
| trStyle = (TextStyle)matches[0]; |
| else { |
| matches = styleCat.getMatching(tr.makeStyle()); |
| if (matches.length == 0) { |
| trStyle = tr.makeStyle(); |
| String newName = new String("TTT" + ++newTextStyleNr); |
| trStyle.setName(newName); |
| styleCat.add(trStyle); |
| } else if (matches.length == 1) |
| trStyle = (TextStyle)matches[0]; |
| else |
| log("multiple text style matches from new catalog"); |
| } |
| |
| // Create a text span node, set the style attribute, make the |
| // text node(s) its children, and append it to current paragraph's |
| // list of children. |
| Element textSpanNode = doc.createElement(TAG_SPAN); |
| textSpanNode.setAttribute(ATTRIBUTE_TEXT_STYLE_NAME, trStyle.getName()); |
| for (int ii = 0; ii < trNodes.length; ii++) { |
| textSpanNode.appendChild(trNodes[ii]); |
| } |
| currPara.appendChild(textSpanNode); |
| log("</SPAN>"); |
| } |
| |
| else if (data[i].getClass() == WseFontTable.class) { |
| fontTable = (WseFontTable)data[i]; |
| } |
| |
| else if (data[i].getClass() == WseColorTable.class) { |
| colorTable = (WseColorTable)data[i]; |
| } |
| } |
| |
| |
| //NodeList r = doc.getElementsByTagName(TAG_OFFICE_DOCUMENT); |
| NodeList r = doc.getElementsByTagName(TAG_OFFICE_DOCUMENT_CONTENT); |
| Node rootNode = r.item(0); |
| |
| // read the original document |
| org.w3c.dom.NodeList nl; |
| if (origDoc != null) { |
| java.io.ByteArrayOutputStream bos = new java.io.ByteArrayOutputStream(); |
| origDoc.write(bos); |
| SxwDocument origSxwDoc = new SxwDocument("old"); |
| origSxwDoc.read(new ByteArrayInputStream(bos.toByteArray())); |
| org.w3c.dom.Document origDomDoc = origSxwDoc.getContentDOM(); |
| |
| XmlUtil xu = new XmlUtil(); |
| org.w3c.dom.DocumentFragment df; |
| org.w3c.dom.Node newNode; |
| |
| // copy font declarations from original document to the new document |
| nl = origDomDoc.getElementsByTagName(TAG_OFFICE_FONT_DECLS); |
| df = doc.createDocumentFragment(); |
| newNode = xu.deepClone(df, nl.item(0)); |
| rootNode.insertBefore(newNode, bodyNode); |
| |
| // copy style catalog from original document to the new document |
| nl = origDomDoc.getElementsByTagName(TAG_OFFICE_STYLES); |
| df = doc.createDocumentFragment(); |
| newNode = xu.deepClone(df, nl.item(0)); |
| rootNode.insertBefore(newNode, bodyNode); |
| |
| nl = origDomDoc.getElementsByTagName(TAG_OFFICE_AUTOMATIC_STYLES); |
| df = doc.createDocumentFragment(); |
| newNode = xu.deepClone(df, nl.item(0)); |
| rootNode.insertBefore(newNode, bodyNode); |
| |
| nl = origDomDoc.getElementsByTagName(TAG_OFFICE_MASTER_STYLES); |
| df = doc.createDocumentFragment(); |
| newNode = xu.deepClone(df, nl.item(0)); |
| rootNode.insertBefore(newNode, bodyNode); |
| } |
| |
| // Original document not specified. We need to add font declarations. |
| // DJP: this might just be for debugging. Merger will probably put |
| // the "real" ones in. |
| // DJP: if really doing it this way, do it right: gather font names |
| // from style catalog(s). |
| else { |
| org.w3c.dom.Node declNode; |
| |
| log("<FONT-DECLS/>"); |
| |
| declNode = doc.createElement(TAG_OFFICE_FONT_DECLS); |
| rootNode.insertBefore(declNode, bodyNode); |
| org.w3c.dom.Element fontNode; |
| |
| fontNode = doc.createElement(TAG_STYLE_FONT_DECL); |
| fontNode.setAttribute(ATTRIBUTE_STYLE_NAME, "Arial"); |
| fontNode.setAttribute(ATTRIBUTE_FO_FONT_FAMILY, "Arial"); |
| fontNode.setAttribute(ATTRIBUTE_STYLE_FONT_PITCH, "variable"); |
| declNode.appendChild(fontNode); |
| |
| fontNode = doc.createElement(TAG_STYLE_FONT_DECL); |
| fontNode.setAttribute(ATTRIBUTE_STYLE_NAME, "Arioso"); |
| fontNode.setAttribute(ATTRIBUTE_FO_FONT_FAMILY, "Arioso"); |
| fontNode.setAttribute(ATTRIBUTE_STYLE_FONT_PITCH, "variable"); |
| declNode.appendChild(fontNode); |
| } |
| |
| |
| // Now add any new styles we have created in this document. |
| nl = doc.getElementsByTagName(TAG_OFFICE_AUTOMATIC_STYLES); |
| Node autoStylesNode = nl.item(0); |
| if (autoStylesNode == null) { |
| autoStylesNode = doc.createElement(TAG_OFFICE_AUTOMATIC_STYLES); |
| log("<OFFICE-AUTOMATIC-STYLES/>"); |
| rootNode.insertBefore(autoStylesNode, bodyNode); |
| } |
| |
| Node newStyleCatNode = styleCat.writeNode(doc, "dummy"); |
| nl = newStyleCatNode.getChildNodes(); |
| int nNodes = nl.getLength(); |
| for (int i = 0; i < nNodes; i++) { |
| autoStylesNode.appendChild(nl.item(0)); |
| } |
| |
| oldStyleCat.dumpCSV(true); |
| styleCat.dumpCSV(true); |
| return sxwDoc; |
| } |
| |
| |
| /** |
| * Sends message to the log object. |
| * |
| * @param str Debug message. |
| */ |
| private void log(String str) { |
| |
| Debug.log(Debug.TRACE, str); |
| } |
| |
| |
| /* |
| public static void main(String args[]) { |
| |
| // DocumentDeserializerImpl d = new DocumentDeserializerImpl(new InputStream()); |
| |
| Node nodes[] = parseText("Tab here:\tThen some more text"); |
| } |
| */ |
| } |
| |