blob: a68629e6df2245075bc1e9dae91e9360e372df91 [file] [log] [blame]
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#include "DFPlatform.h"
#include "WordConverter.h"
#include "WordBookmark.h"
#include "WordField.h"
#include "WordStyles.h"
#include "WordSheet.h"
#include "WordNotes.h"
#include "WordNumbering.h"
#include "WordSection.h"
#include "WordSettings.h"
#include "WordObjects.h"
#include "WordLists.h"
#include "WordGC.h"
#include "WordLenses.h"
#include "WordCaption.h"
#include "WordWhitespace.h"
#include "WordTheme.h"
#include "OPC.h"
#include "DFDOM.h"
#include "DFHTML.h"
#include "DFHTMLNormalization.h"
#include "DFBDT.h"
#include "CSS.h"
#include "CSSProperties.h"
#include "CSSLength.h"
#include "CSSSelector.h"
#include "CSSClassNames.h"
#include "CSSSheet.h"
#include "CSSStyle.h"
#include "DFXML.h"
#include "DFString.h"
#include "DFCharacterSet.h"
#include "DFCommon.h"
#include <assert.h>
#include <stdlib.h>
#include <string.h>
// Returns 1 if the run consists solely of run properties and whitespace-only text elements
// (a run with no children at all also qualifies), and 0 as soon as any other kind of child
// or any non-whitespace text is encountered.
static int isWhitespaceRun(DFNode *run)
{
    DFNode *item = run->first;
    while (item != NULL) {
        if (item->tag == WORD_T) {
            char *text = DFNodeTextToString(item);
            int blank = DFStringIsWhitespace(text);
            free(text);
            if (!blank)
                return 0;
        }
        else if (item->tag != WORD_RPR) {
            // Anything other than properties or text counts as visible content
            return 0;
        }
        item = item->next;
    }
    return 1;
}
// Decides whether a paragraph represents a figure.
//
// The paragraph qualifies when, ignoring whitespace-only runs, it has exactly one run among its
// direct children and that run contains a drawing, embedded object, or picture element.
// Children of the paragraph that are not runs are not taken into account.
// Returns 1 for a figure paragraph, 0 otherwise (including when p is NULL or not a <w:p>).
int Word_isFigureParagraph(DFNode *p)
{
    if (p == NULL)
        return 0;
    if (p->tag != WORD_P)
        return 0;

    // Count the runs that carry real content, remembering the most recent one
    DFNode *onlyRun = NULL;
    int significantRuns = 0;
    DFNode *child = p->first;
    while (child != NULL) {
        if ((child->tag == WORD_R) && !isWhitespaceRun(child)) {
            onlyRun = child;
            significantRuns++;
        }
        child = child->next;
    }
    if (significantRuns != 1)
        return 0;

    for (DFNode *item = onlyRun->first; item != NULL; item = item->next) {
        if ((item->tag == WORD_DRAWING) || (item->tag == WORD_OBJECT) || (item->tag == WORD_PICT))
            return 1;
    }
    return 0;
}
// Returns 1 if p is a <w:p> element with an <m:oMathPara> element among its direct children,
// and 0 otherwise (including when p is NULL or is not a paragraph).
int Word_isEquationParagraph(DFNode *p)
{
    if ((p != NULL) && (p->tag == WORD_P)) {
        DFNode *item = p->first;
        while (item != NULL) {
            if (item->tag == MATH_OMATHPARA)
                return 1;
            item = item->next;
        }
    }
    return 0;
}
// Compares the attribute sets of two elements, ignoring attribute order.
// Returns 1 when both elements have the same number of attributes and every attribute of elemA
// has a same-tagged attribute in elemB with an identical string value; returns 0 otherwise.
static int attributesEqual(DFNode *elemA, DFNode *elemB)
{
    if (elemA->attrsCount != elemB->attrsCount)
        return 0;
    int total = elemA->attrsCount;

    for (int i = 0; i < total; i++) {
        DFAttribute *wanted = &elemA->attrs[i];

        // Locate the first attribute in elemB carrying the same tag
        DFAttribute *match = NULL;
        for (int j = 0; (j < total) && (match == NULL); j++) {
            if (elemB->attrs[j].tag == wanted->tag)
                match = &elemB->attrs[j];
        }

        if (match == NULL)
            return 0;
        if (strcmp(wanted->value,match->value) != 0)
            return 0;
    }
    return 1;
}
// Structural equality test for two subtrees.
//
// Two NULL pointers are considered equal; a NULL and a non-NULL node are not. Non-element nodes
// (tag below MIN_ELEMENT_TAG) are never reported as equal, even to themselves. Elements are equal
// when their tags match, their attribute sets match, and their children are pairwise equal.
static int nodesEqual(DFNode *a, DFNode *b)
{
    if (a == NULL)
        return (b == NULL);
    if (b == NULL)
        return 0;
    if ((a->tag != b->tag) || (a->tag < MIN_ELEMENT_TAG))
        return 0;

    // Cheap pass: the child lists must have the same length and the same sequence of tags
    DFNode *left = a->first;
    DFNode *right = b->first;
    for (; (left != NULL) && (right != NULL); left = left->next, right = right->next) {
        if (left->tag != right->tag)
            return 0;
    }
    if ((left != NULL) || (right != NULL))
        return 0;

    if (!attributesEqual(a,b))
        return 0;

    // Expensive pass: recursively compare the children. Both lists are known to have the same
    // length at this point, so walking them in lock-step is safe.
    left = a->first;
    right = b->first;
    while (left != NULL) {
        if (!nodesEqual(left,right))
            return 0;
        left = left->next;
        right = right->next;
    }
    return 1;
}
// Merges adjacent <w:r> children of node whose run properties are identical, then repeats the
// process for every child of node.
//
// When two neighbouring runs match, the content of the second is appended to the first (its
// own <w:rPr> is discarded) and the second run is removed. The first run is then compared with
// its new neighbour, so a whole sequence of equally formatted runs collapses into one.
static void Word_mergeRunsRecursive(DFNode *node)
{
    DFNode *run = node->first;
    while (run != NULL) {
        DFNode *follower = run->next;

        int mergeable = 0;
        if ((run->tag == WORD_R) && (follower != NULL) && (follower->tag == WORD_R)) {
            DFNode *runProps = DFChildWithTag(run,WORD_RPR);
            DFNode *followerProps = DFChildWithTag(follower,WORD_RPR);
            mergeable = nodesEqual(runProps,followerProps);
        }

        if (!mergeable) {
            run = follower;
            continue;
        }

        // Drain the follower: drop its properties, move everything else across
        for (DFNode *moved = follower->first; moved != NULL; moved = follower->first) {
            if (moved->tag == WORD_RPR)
                DFRemoveNode(moved);
            else
                DFAppendChild(run,moved);
        }
        DFRemoveNode(follower);
        // Stay on the same run so that it is checked against its new next sibling
    }

    for (DFNode *child = node->first; child != NULL; child = child->next)
        Word_mergeRunsRecursive(child);
}
// Merges equally formatted adjacent runs throughout the package's main document.
// Does nothing if the package has no main document.
static void Word_mergeRuns(WordPackage *package)
{
    if (package->document == NULL)
        return;
    Word_mergeRunsRecursive(package->document->docNode);
}
////////////////////////////////////////////////////////////////////////////////////////////////////
// //
// HTML pre- and post-processing //
// //
////////////////////////////////////////////////////////////////////////////////////////////////////
// Materialises a CSS "content" value as real nodes at the start of a caption element.
//
//   child   - the caption element; generated spans are inserted before what was its first child
//             on entry, so they appear in the same order as the parts of the content value
//   content - the CSS content value to parse; if NULL the function does nothing
//   caption - receives, in its number field, the span created for a figure/table counter
//
// Non-empty string parts become <span> elements wrapping a text node. Counter parts named
// "figure" or "table" become field spans holding the corresponding SEQ instruction; counters
// with any other name, and parts of any other type, are ignored.
static void Word_addContentParts(DFNode *child, const char *content, WordCaption *caption)
{
    if (content == NULL)
        return;

    DFNode *nextSibling = child->first;
    DFArray *parts = CSSParseContent(content);
    for (size_t i = 0; i < DFArrayCount(parts); i++) {
        ContentPart *part = DFArrayItemAt(parts,i);
        switch (part->type) {
            case ContentPartString: {
                // Only create nodes when there is something to show; previously a text node was
                // created for empty strings as well and then left unattached.
                if (strlen(part->value) > 0) {
                    DFNode *text = DFCreateTextNode(child->doc,part->value);
                    DFNode *span = DFCreateElement(child->doc,HTML_SPAN);
                    DFAppendChild(span,text);
                    DFInsertBefore(child,span,nextSibling);
                }
                break;
            }
            case ContentPartCounter: {
                const char *instr = NULL;
                if (DFStringEquals(part->value,"figure"))
                    instr = " SEQ Figure \\* ARABIC ";
                else if (DFStringEquals(part->value,"table"))
                    instr = " SEQ Table \\* ARABIC ";

                if (instr != NULL) {
                    DFNode *span = DFCreateElement(child->doc,HTML_SPAN);
                    DFSetAttribute(span,HTML_CLASS,DFFieldClass);
                    DFCreateChildTextNode(span,instr);
                    DFInsertBefore(child,span,nextSibling);
                    caption->number = span;
                }
                break;
            }
            default:
                break;
        }
    }
    DFArrayRelease(parts);
}
// Recursively rewrites the HTML tree into a shape that the Word lenses can map onto
// WordprocessingML.
//
//   <table>/<figure>: each <caption>/<figcaption> child is registered with word->objects, gets
//       the parts of its style's ::before "content" inserted as real nodes, is turned into a
//       <p class="Caption">, and is moved to directly after the table/figure. Paragraph children
//       of the table/figure are then unwrapped so that only their content remains.
//   <nav>: when its class identifies a table of contents, list of figures, or list of tables,
//       it is replaced by a paragraph containing the equivalent TOC field.
//
// After handling node itself, all of its children are processed the same way.
static void Word_preProcessHTML(WordConverter *word, DFNode *node)
{
    switch (node->tag) {
        case HTML_TABLE:
        case HTML_FIGURE: {
            DFNode *next;
            for (DFNode *child = node->first; child != NULL; child = next) {
                next = child->next;

                if ((child->tag != HTML_CAPTION) && (child->tag != HTML_FIGCAPTION))
                    continue;

                WordCaption *caption = WordCaptionNew(child);
                WordObjectsSetCaption(word->objects,caption,node);
                caption->contentStart = child->first;

                const char *className = DFGetAttribute(child,HTML_CLASS);
                CSSStyle *style;
                if (child->tag == HTML_CAPTION)
                    style = CSSSheetLookupElement(word->styleSheet,"caption",className,0,0);
                else
                    style = CSSSheetLookupElement(word->styleSheet,"figcaption",className,0,0);

                CSSProperties *before = CSSStyleBefore(style);
                const char *content = CSSGet(before,"content");
                if (content != NULL)
                    Word_addContentParts(child,content,caption);

                // Give up our reference only now that nothing below touches the caption any
                // more. Previously it was released before Word_addContentParts() wrote to
                // caption->number, which was only safe if word->objects kept its own reference.
                WordCaptionRelease(caption);

                child->tag = HTML_P;
                DFSetAttribute(child,HTML_CLASS,"Caption");
                DFInsertBefore(node->parent,child,node->next);
                // The caption now sits after node, so the caller's iteration will not reach it;
                // process it explicitly.
                Word_preProcessHTML(word,child);
            }

            // The HTML normalization process ensures that apart from the <figcaption> element,
            // all children of a <figure> are paragraphs or containers. Currently the editor only
            // lets you create figures that contain a single image, so it's always a single
            // paragraph. Since the HTML <figure> element gets mapped to a single <w:p> element
            // by WordParagraphLens, we want to make sure it only contains inline children.
            for (DFNode *child = node->first; child != NULL; child = next) {
                next = child->next;
                if (HTML_isParagraphTag(child->tag))
                    DFRemoveNodeButKeepChildren(child);
            }

            // FIXME: Handle <div>, <pre>, lists, tables etc which could also theoretically
            // exist inside the <figure> element
            break;
        }
        case HTML_NAV: {
            const char *className = DFGetAttribute(node,HTML_CLASS);
            const char *instr = NULL;
            if (DFStringEquals(className,DFTableOfContentsClass))
                instr = " TOC \\o \"1-3\" ";
            else if (DFStringEquals(className,DFListOfFiguresClass))
                instr = " TOC \\c \"Figure\" ";
            else if (DFStringEquals(className,DFListOfTablesClass))
                instr = " TOC \\c \"Table\" ";

            if (instr != NULL) {
                DFNode *p = DFCreateElement(word->html,HTML_P);
                DFNode *field = DFCreateChildElement(p,HTML_SPAN);
                DFSetAttribute(field,HTML_CLASS,DFFieldClass);
                DFCreateChildTextNode(field,instr);
                DFInsertBefore(node->parent,p,node);
                DFRemoveNode(node);
            }
            break;
        }
    }

    DFNode *next;
    for (DFNode *child = node->first; child != NULL; child = next) {
        next = child->next;
        Word_preProcessHTML(word,child);
    }
}
// Runs all HTML pre-processing steps: list pre-processing first, followed by the generic
// tree rewrite starting at the document node of doc.
static void Word_preProcessHTMLDoc(WordConverter *word, DFDocument *doc)
{
    WordPreProcessHTMLLists(word);

    DFNode *top = doc->docNode;
    Word_preProcessHTML(word,top);
}
// Returns 1 if node is a <span> marked with the field class whose text, when parsed as a field
// instruction, has "SEQ" as its first argument; returns 0 otherwise.
static int isSeqField(DFNode *node)
{
    if ((node->tag != HTML_SPAN) ||
        !DFStringEquals(DFGetAttribute(node,HTML_CLASS),DFFieldClass))
        return 0;

    char *instruction = DFNodeTextToString(node);
    const char **fieldArgs = Word_parseField(instruction);

    int isSeq = 0;
    if (fieldArgs[0] != NULL)
        isSeq = (strcmp(fieldArgs[0],"SEQ") == 0);

    free(fieldArgs);
    free(instruction);
    return isSeq;
}
// Depth-first, document-order search for the first SEQ field below parent (parent itself is not
// tested). Returns the field's span, or NULL if there is none.
static DFNode *findSeqChild(DFNode *parent)
{
    DFNode *child = parent->first;
    while (child != NULL) {
        DFNode *hit = isSeqField(child) ? child : findSeqChild(child);
        if (hit != NULL)
            return hit;
        child = child->next;
    }
    return NULL;
}
// Walks the subtree rooted at node in document order, moving the leading "label" portion of a
// caption out of the tree and into result, expressed as a CSS content value.
//
//   node         - the node to examine; it may be modified or removed from the tree
//   counterName  - name written into the counter(...) term that replaces the SEQ field
//   result       - buffer the content value is appended to; terms are separated by one space
//   foundSeq     - in/out flag, set to 1 once the SEQ field has been consumed
//   foundContent - in/out flag, set to 1 once text following the SEQ field has been consumed;
//                  it stops the traversal of any remaining children
//
// Text encountered before the SEQ field is taken in its entirety. For text encountered after
// the SEQ field, only the leading run of whitespace/punctuation characters is taken.
static void extractPrefixRecursive(DFNode *node, const char *counterName, DFBuffer *result,
int *foundSeq, int *foundContent)
{
// The SEQ field itself becomes counter(<counterName>) and is dropped from the tree
if (isSeqField(node)) {
if (result->len > 0)
DFBufferFormat(result," ");
DFBufferFormat(result,"counter(%s)",counterName);
*foundSeq = 1;
DFRemoveNode(node);
return;
}
if (node->tag == DOM_TEXT) {
size_t valueLen = strlen(node->value);
// pos is the number of leading bytes of this text node that belong to the prefix
size_t pos = 0;
if (*foundSeq) {
// After the SEQ field: advance pos past leading whitespace and punctuation only.
// NOTE(review): presumably DFNextChar decodes one character at offset and advances
// offset past it, returning 0 at the end of the string - confirm against DFString.
size_t offset = 0;
uint32_t ch;
do {
pos = offset;
ch = DFNextChar(node->value,&offset);
} while ((ch != 0) && (DFCharIsWhitespaceOrNewline(ch) || DFCharIsPunctuation(ch)));
}
else {
// Before the SEQ field: the whole text node is part of the prefix
pos = valueLen;
}
if (pos == valueLen) {
// Entire node consumed: emit it as a quoted string and remove the node
if (result->len > 0)
DFBufferFormat(result," ");
char *quotedValue = DFQuote(node->value);
DFBufferFormat(result,"%s",quotedValue);
free(quotedValue);
DFRemoveNode(node);
if (*foundSeq)
*foundContent = 1;
return;
}
else if (pos > 0) {
// Partially consumed: emit the leading part and leave the remainder in the node
char *first = DFSubstring(node->value,0,pos);
char *rest = DFSubstring(node->value,pos,valueLen);
if (result->len > 0)
DFBufferFormat(result," ");
char *quotedFirst = DFQuote(first);
DFBufferFormat(result,"%s",quotedFirst);
free(quotedFirst);
DFSetNodeValue(node,rest);
if (*foundSeq)
*foundContent = 1;
free(first);
free(rest);
return;
}
// pos == 0 with non-empty text: nothing is taken from this node, and *foundContent is left
// unchanged, so control falls through to the (empty) child loop below.
}
// Remember whether the node had children to begin with, so that only spans which were emptied
// by this traversal get removed below
int wasEmpty = (node->first == NULL);
DFNode *next;
for (DFNode *child = node->first; child != NULL; child = next) {
// Capture the sibling first, since the recursive call may remove child
next = child->next;
if (*foundContent)
break;
extractPrefixRecursive(child,counterName,result,foundSeq,foundContent);
}
int isEmpty = (node->first == NULL);
if ((node->tag == HTML_SPAN) && isEmpty && !wasEmpty)
DFRemoveNode(node);
}
// Extracts the label prefix of a caption as a CSS content value, removing the corresponding
// nodes/text from the tree under node.
//
// Returns NULL (leaving the tree untouched) when the caption contains no SEQ field; otherwise
// returns a newly allocated string which the caller must free.
static char *extractPrefix(DFNode *node, const char *counterName)
{
    char *prefix = NULL;
    if (findSeqChild(node) != NULL) {
        int seenSeq = 0;
        int seenContent = 0;
        DFBuffer *buffer = DFBufferNew();
        extractPrefixRecursive(node,counterName,buffer,&seenSeq,&seenContent);
        prefix = xstrdup(buffer->data);
        DFBufferRelease(buffer);
    }
    return prefix;
}
// Recursively tidies up the HTML tree produced by the lenses. For each child of node:
//
//   - <span> elements carrying the bookmark class are unwrapped (children kept in place)
//   - <caption> elements are attached to an adjacent <figure> or <table> that does not yet have
//     a caption, and their label prefix is moved into the stylesheet as ::before content
//   - <nav> elements found inside a paragraph-type element are hoisted out of it
//
// Afterwards every remaining child of node is processed in the same way.
static void Word_postProcessHTML(WordConverter *conv, DFNode *node)
{
DFNode *next;
for (DFNode *child = node->first; child != NULL; child = next) {
// Captured up front because the cases below may move or remove child
next = child->next;
switch (child->tag) {
case HTML_SPAN: {
const char *className = DFGetAttribute(child,HTML_CLASS);
if (DFStringEquals(className,DFBookmarkClass)) {
// Continue with the span's own content (if any), which takes the span's place
if (child->first != NULL)
next = child->first;
DFRemoveNodeButKeepChildren(child);
}
break;
}
case HTML_CAPTION: {
// counterName stays NULL if no neighbouring figure/table can adopt the caption
const char *counterName = NULL;
// Preceding figure without a caption: append as <figcaption>
if ((child->prev != NULL) && (child->prev->tag == HTML_FIGURE) &&
(DFChildWithTag(child->prev,HTML_FIGCAPTION) == NULL)) {
child->tag = HTML_FIGCAPTION;
counterName = "figure";
DFAppendChild(child->prev,child);
}
// Preceding table without a caption: insert as its first child
else if ((child->prev != NULL) && (child->prev->tag == HTML_TABLE) &&
(DFChildWithTag(child->prev,HTML_CAPTION) == NULL)) {
counterName = "table";
DFInsertBefore(child->prev,child,child->prev->first);
}
// Following figure without a caption: insert as <figcaption> at the start
else if ((child->next != NULL) && (child->next->tag == HTML_FIGURE) &&
(DFChildWithTag(child->next,HTML_FIGCAPTION) == NULL)) {
child->tag = HTML_FIGCAPTION;
counterName = "figure";
DFInsertBefore(child->next,child,child->next->first);
}
// Following table without a caption: insert as its first child, and record that the
// caption was above the table
else if ((child->next != NULL) && (child->next->tag == HTML_TABLE) &&
(DFChildWithTag(child->next,HTML_CAPTION) == NULL)) {
counterName = "table";
DFSetAttribute(child,HTML_STYLE,"caption-side: top");
DFInsertBefore(child->next,child,child->next->first);
}
if (counterName != NULL) {
// Move the label (e.g. text plus SEQ field) out of the caption. It is only written to the
// element's style if that style has no ::before content yet.
char *beforeText = extractPrefix(child,counterName);
if (beforeText != NULL) {
CSSStyle *style = CSSSheetLookupElement(conv->styleSheet,DFNodeName(child),NULL,1,0);
if (CSSGet(CSSStyleBefore(style),"content") == NULL) {
CSSPut(CSSStyleRule(style),"counter-increment",counterName);
CSSPut(CSSStyleBefore(style),"content",beforeText);
}
}
free(beforeText);
}
break;
}
case HTML_NAV: {
if (HTML_isParagraphTag(node->tag)) {
// Move everything that precedes the nav into a new element of the same type as node,
// placed before node
if (child->prev != NULL) {
DFNode *beforeP = DFCreateElement(conv->html,node->tag);
while (child->prev != NULL)
DFInsertBefore(beforeP,child->prev,beforeP->first);
DFInsertBefore(node->parent,beforeP,node);
}
// Hoist the nav itself so that it becomes a sibling preceding node
DFInsertBefore(node->parent,child,node);
// If node is now empty, or holds nothing but a single <br>, drop it entirely
if ((node->first == NULL) ||
((node->first->tag == HTML_BR) && (node->first->next == NULL))) {
DFRemoveNode(node);
return;
}
// Stop scanning the children of node
next = NULL;
}
break;
}
}
}
// Second pass: recurse into whatever children node has after the restructuring above
for (DFNode *child = node->first; child != NULL; child = next) {
next = child->next;
Word_postProcessHTML(conv,child);
}
}
// Runs all HTML post-processing steps on the converter's HTML document: list post-processing
// first, followed by the generic tree clean-up starting at the document node.
static void Word_postProcessHTMLDoc(WordConverter *conv)
{
    WordPostProcessHTMLLists(conv);

    DFNode *top = conv->html->docNode;
    Word_postProcessHTML(conv,top);
}
////////////////////////////////////////////////////////////////////////////////////////////////////
// //
// WordConverter //
// //
////////////////////////////////////////////////////////////////////////////////////////////////////
// Allocates and initialises a converter that ties together an HTML document, the storage holding
// its resources, and a Word package.
//
//   html            - abstract (HTML) document; retained
//   abstractStorage - storage for the abstract document; retained, and asserted to be HTML format
//   package         - Word package being read from / written to; retained
//   idPrefix        - prefix for generated HTML id attributes; copied, defaulting to "word" if NULL
//
// The styleSheet field is left zeroed by the allocation; callers assign it afterwards.
// The result must be disposed of with WordConverterFree().
static WordConverter *WordConverterNew(DFDocument *html, DFStorage *abstractStorage, WordPackage *package, const char *idPrefix)
{
    // File extension -> MIME type for the image formats the converter accepts
    static const char *const contentTypes[][2] = {
        { "jpg",  "image/jpeg" },
        { "jpeg", "image/jpeg" },
        { "tif",  "image/tiff" },
        { "tiff", "image/tiff" },
        { "gif",  "image/gif"  },
        { "bmp",  "image/bmp"  },
        { "png",  "image/png"  },
    };

    WordConverter *conv = (WordConverter *)xcalloc(1,sizeof(WordConverter));

    conv->html = DFDocumentRetain(html);
    conv->abstractStorage = DFStorageRetain(abstractStorage);
    assert(DFStorageFormat(conv->abstractStorage) == DFFileFormatHTML);
    conv->idPrefix = xstrdup((idPrefix != NULL) ? idPrefix : "word");
    conv->package = WordPackageRetain(package);

    conv->styles = WordSheetNew(conv->package->styles);
    conv->numbering = WordNumberingNew(conv->package);
    conv->theme = WordThemeNew(conv->package);
    conv->mainSection = WordSectionNew();
    conv->objects = WordObjectsNew(conv->package);
    conv->footnotes = WordNoteGroupNewFootnotes(conv->package->footnotes);
    conv->endnotes = WordNoteGroupNewEndnotes(conv->package->endnotes);

    conv->supportedContentTypes = DFHashTableNew((DFCopyFunction)xstrdup,free);
    for (size_t i = 0; i < sizeof(contentTypes)/sizeof(contentTypes[0]); i++)
        DFHashTableAdd(conv->supportedContentTypes,contentTypes[i][0],contentTypes[i][1]);

    conv->warnings = DFBufferNew();
    return conv;
}
// Releases everything owned by a converter created with WordConverterNew(), including the
// converter structure itself. converter must not be NULL, and must not be used afterwards.
static void WordConverterFree(WordConverter *converter)
{
DFDocumentRelease(converter->html);
DFStorageRelease(converter->abstractStorage);
free(converter->idPrefix);
WordSheetFree(converter->styles);
WordNumberingFree(converter->numbering);
WordThemeFree(converter->theme);
WordSectionFree(converter->mainSection);
WordObjectsFree(converter->objects);
WordNoteGroupRelease(converter->footnotes);
WordNoteGroupRelease(converter->endnotes);
DFHashTableRelease(converter->supportedContentTypes);
DFBufferRelease(converter->warnings);
// styleSheet is assigned by WordConverterGet()/WordConverterPut(), not by WordConverterNew()
CSSSheetRelease(converter->styleSheet);
// NOTE(review): the package is released after the helper objects that were constructed from
// it - presumably deliberate; keep this order unless those helpers are known not to reference it
WordPackageRelease(converter->package);
free(converter);
}
// Creates a new element with the given tag in the converter's HTML document.
//
// When concrete is non-NULL, the element is given an id attribute that encodes which concrete
// node it was derived from: "<prefix><seqNo>" for nodes of the package's main document, or
// "<prefix><seqNo>-<root element name>" for nodes of any other document. The new element is
// returned without being inserted into the tree.
DFNode *WordConverterCreateAbstract(WordGetData *get, Tag tag, DFNode *concrete)
{
    WordConverter *conv = get->conv;
    DFNode *element = DFCreateElement(conv->html,tag);
    if (concrete == NULL)
        return element;

    char *id = NULL;
    if (concrete->doc != conv->package->document)
        id = DFFormatString("%s%u-%s",conv->idPrefix,concrete->seqNo,DFNodeName(concrete->doc->root));
    else
        id = DFFormatString("%s%u",conv->idPrefix,concrete->seqNo);

    DFSetAttribute(element,HTML_ID,id);
    free(id);
    return element;
}
// Maps an element of the abstract (HTML) document back to the concrete (Word) element it was
// generated from, based on the element's id attribute.
//
// Returns NULL if abstract is NULL or not an element, if it has no id or the id does not have
// the form described below, if the document named by the id is unknown or not present in the
// package, or if the identified node does not exist or is not an element.
DFNode *WordConverterGetConcrete(WordPutData *put, DFNode *abstract)
{
    // Is the abstract node an element, and does it have an id that matches the prefix used for
    // conversion? That is, does it look like it has a corresponding node in the concrete document?
    if ((abstract == NULL) || (abstract->tag < MIN_ELEMENT_TAG))
        return NULL;
    const char *idStr = DFGetAttribute(abstract,HTML_ID);
    if ((idStr == NULL) || !DFStringHasPrefix(idStr,put->conv->idPrefix))
        return NULL;

    // Determine the node sequence number and the document based on the id attribute.
    // The format of the attribute is <prefix><seqno>(-<docname>)?, where
    //
    // <prefix>  is the BDT prefix we use to identify nodes that match the original document
    // <seqno>   is an integer uniquely identifying a node in a given document
    // <docname> is the name of the document, either footnotes or endnotes. If absent, it is
    //           the main content document (that is, document.xml)
    //
    // Note that the sequence number only makes sense within the context of a specific document. It
    // is possible to have two different nodes in different documents that have the same sequence
    // number. It is for this reason that the id string identifies both the node and the document.
    size_t idLen = strlen(idStr);
    size_t prefixLen = strlen(put->conv->idPrefix);
    size_t pos = prefixLen;
    unsigned int seqNo = 0;
    while ((pos < idLen) && (idStr[pos] >= '0') && (idStr[pos] <= '9'))
        seqNo = seqNo*10 + (unsigned int)(idStr[pos++] - '0');

    // An id that merely starts with the prefix but carries no digits (e.g. a user-chosen id)
    // does not name any sequence number; don't let it alias sequence number 0.
    if (pos == prefixLen)
        return NULL;

    const char *docName = NULL;
    if ((pos < idLen) && (idStr[pos] == '-')) {
        pos++;
        docName = &idStr[pos];
    }

    DFDocument *doc = NULL;
    if (docName == NULL)
        doc = put->conv->package->document;
    else if (!strcmp(docName,"footnotes"))
        doc = put->conv->package->footnotes;
    else if (!strcmp(docName,"endnotes"))
        doc = put->conv->package->endnotes;
    else
        return NULL;

    // The id may name a document the package does not contain (e.g. no footnotes part)
    if (doc == NULL)
        return NULL;

    // Check to see if we have a node in the concrete document matching that sequence number
    DFNode *node = DFNodeForSeqNo(doc,seqNo);

    // Only return the node if it's actually an element
    if ((node == NULL) || (node->tag < MIN_ELEMENT_TAG))
        return NULL;
    return node;
}
// Converts the content of a Word package into HTML (the "get" direction).
//
//   html            - HTML document that receives the generated content and stylesheets
//   abstractStorage - storage associated with the HTML document
//   package         - Word package to read from; its main document is simplified in place
//                     (fields, runs, non-breaking spaces, lists) as part of the conversion
//   idPrefix        - prefix for generated id attributes (see WordConverterNew)
//   error           - receives a description on failure
//
// Returns 1 on success. Returns 0, with *error set, if the package has no main document or no
// word:document element, or if any warnings were recorded during conversion. In the latter case
// the conversion steps have nevertheless all been carried out.
int WordConverterGet(DFDocument *html, DFStorage *abstractStorage, WordPackage *package, const char *idPrefix, DFError **error)
{
if (package->document == NULL) {
DFErrorFormat(error,"document.xml not found");
return 0;
}
DFNode *wordDocument = DFChildWithTag(package->document->docNode,WORD_DOCUMENT);
if (wordDocument == NULL) {
DFErrorFormat(error,"word:document not found");
return 0;
}
// Normalise the concrete document before the converter (and its helper objects) is created
int haveFields = Word_simplifyFields(package);
Word_mergeRuns(package);
WordConverter *converter = WordConverterNew(html,abstractStorage,package,idPrefix);
converter->haveFields = haveFields;
WordAddNbsps(converter->package->document);
WordFixLists(converter);
// Replace the (so far unset) stylesheet with one derived from the Word styles
CSSSheetRelease(converter->styleSheet);
converter->styleSheet = WordParseStyles(converter);
WordObjectsCollapseBookmarks(converter->objects);
WordObjectsScan(converter->objects);
WordObjectsAnalyzeBookmarks(converter->objects,converter->styles);
// Run the document lens to build the HTML tree, and make it the root of the HTML document
WordGetData get;
get.conv = converter;
DFNode *abstract = WordDocumentLens.get(&get,wordDocument);
DFAppendChild(converter->html->docNode,abstract);
converter->html->root = abstract;
Word_postProcessHTMLDoc(converter);
// Post-processing may have added rules to the stylesheet, so serialise it only afterwards
HTMLAddExternalStyleSheet(converter->html,"reset.css");
char *cssText = CSSSheetCopyCSSText(converter->styleSheet);
HTMLAddInternalStyleSheet(converter->html,cssText);
free(cssText);
HTML_safeIndent(converter->html->docNode,0);
// Any accumulated warnings are reported as a failure
int ok = 1;
if (converter->warnings->len > 0) {
DFErrorFormat(error,"%s",converter->warnings->data);
ok = 0;
}
WordConverterFree(converter);
return ok;
}
// Recursively records, for the subtree rooted at node, which HTML list each existing Word
// numbering id corresponds to.
//
// For every <p> that carries a list number attribute and maps back to a concrete paragraph with
// a w:pPr/w:numPr/w:numId value, the pair (html list id, numId) is stored in both lookup tables
// of put, provided neither id has been mapped already. The reference count of the matching
// concrete numbering definition, if any, is incremented for each pair that is stored.
static void buildListMapFromHTML(WordPutData *put, DFNode *node)
{
    if (node->tag == HTML_P) {
        const char *numId = NULL;
        const char *htmlId = DFGetAttribute(node,CONV_LISTNUM);

        if (htmlId != NULL) {
            // Descend concrete paragraph -> pPr -> numPr -> numId, stopping at the first gap
            DFNode *conElem = WordConverterGetConcrete(put,node);
            DFNode *pPrElem = NULL;
            DFNode *numPrElem = NULL;
            DFNode *numIdElem = NULL;
            if (conElem != NULL)
                pPrElem = DFChildWithTag(conElem,WORD_PPR);
            if (pPrElem != NULL)
                numPrElem = DFChildWithTag(pPrElem,WORD_NUMPR);
            if (numPrElem != NULL)
                numIdElem = DFChildWithTag(numPrElem,WORD_NUMID);
            if (numIdElem != NULL)
                numId = DFGetAttribute(numIdElem,WORD_VAL);
        }

        if ((numId != NULL) &&
            (DFHashTableLookup(put->htmlIdByNumId,numId) == NULL) &&
            (DFHashTableLookup(put->numIdByHtmlId,htmlId) == NULL)) {
            DFHashTableAdd(put->htmlIdByNumId,numId,htmlId);
            DFHashTableAdd(put->numIdByHtmlId,htmlId,numId);

            WordConcreteNum *num = WordNumberingConcreteWithId(put->conv->numbering,numId);
            if (num != NULL)
                num->referenceCount++;
        }
    }

    DFNode *child = node->first;
    while (child != NULL) {
        buildListMapFromHTML(put,child);
        child = child->next;
    }
}
// Drops list mappings whose Word numbering level no longer matches the list style type
// recorded on the HTML list node.
//
// For every mapped HTML list id, the list style type of the concrete numbering level is
// compared with the type stored on the HTML node. On a mismatch the pair is removed from both
// lookup tables of put, and the concrete numbering definition is removed as well if its
// reference count is exactly 1. Entries whose numbering definition or level cannot be found are
// left as they are.
static void updateListTypes(WordPutData *put)
{
    const char **keys = DFHashTableCopyKeys(put->numIdByHtmlId);
    for (const char **cursor = keys; *cursor != NULL; cursor++) {
        const char *htmlId = *cursor;
        const char *numId = DFHashTableLookup(put->numIdByHtmlId,htmlId);

        WordConcreteNum *num = WordNumberingConcreteWithId(put->conv->numbering,numId);
        if (num == NULL)
            continue; // FIXME: remove entry from both maps so it is re-created

        // The HTML list id is the sequence number of the list node in the HTML document
        DFNode *listNode = DFNodeForSeqNo(put->conv->html,(unsigned int)atoi(htmlId));
        assert(listNode != NULL);
        const char *htmlType = DFGetAttribute(listNode,CONV_LISTTYPE);
        const char *htmlIlvl = DFGetAttribute(listNode,CONV_ILVL);

        WordNumLevel *level = WordConcreteNumGetLevel(num,atoi(htmlIlvl));
        if (level == NULL)
            continue; // FIXME: remove entry from both maps so it is re-created

        if (DFStringEquals(WordNumLevelToListStyleType(level),htmlType))
            continue;

        // Make a copy of numId, as it may be freed during the first call to DFHashTableRemove
        char *numIdCopy = xstrdup(numId);
        DFHashTableRemove(put->numIdByHtmlId,htmlId);
        DFHashTableRemove(put->htmlIdByNumId,numIdCopy);
        free(numIdCopy);

        if (num->referenceCount == 1)
            WordNumberingRemoveConcrete(put->conv->numbering,num);
    }
    free(keys);
}
// Ensures the converter's stylesheet has a default style for the paragraph, character and table
// families, creating "p.Normal", "span.DefaultParagraphFont" and "table.Normal_Table"
// respectively for whichever defaults are missing. Families that already have a default are
// left alone.
static void addMissingDefaultStyles(WordConverter *converter)
{
    // Cell padding applied to a newly created default table style
    static const char *const cellPadding[][2] = {
        { "padding-left",   "5.4pt" },
        { "padding-right",  "5.4pt" },
        { "padding-top",    "0pt"   },
        { "padding-bottom", "0pt"   },
    };

    if (CSSSheetDefaultStyleForFamily(converter->styleSheet,StyleFamilyParagraph) == NULL) {
        CSSStyle *paragraph = CSSSheetLookupElement(converter->styleSheet,"p","Normal",1,0);
        CSSSheetSetDefaultStyle(converter->styleSheet,paragraph,StyleFamilyParagraph);
    }

    if (CSSSheetDefaultStyleForFamily(converter->styleSheet,StyleFamilyCharacter) == NULL) {
        CSSStyle *character = CSSSheetLookupElement(converter->styleSheet,"span","DefaultParagraphFont",1,0);
        CSSStyleSetDisplayName(character,"Default Paragraph Font");
        CSSSheetSetDefaultStyle(converter->styleSheet,character,StyleFamilyCharacter);
    }

    if (CSSSheetDefaultStyleForFamily(converter->styleSheet,StyleFamilyTable) == NULL) {
        CSSStyle *table = CSSSheetLookupElement(converter->styleSheet,"table","Normal_Table",1,0);
        CSSStyleSetDisplayName(table,"Normal Table");
        for (size_t i = 0; i < sizeof(cellPadding)/sizeof(cellPadding[0]); i++)
            CSSPut(CSSStyleCell(table),cellPadding[i][0],cellPadding[i][1]);
        CSSSheetSetDefaultStyle(converter->styleSheet,table,StyleFamilyTable);
    }
}
// Updates a Word package from an HTML document (the "put" direction).
//
//   html            - HTML document to convert; it is normalised and pre-processed in place
//   abstractStorage - storage associated with the HTML document
//   package         - Word package to update; must have a styles part (asserted)
//   idPrefix        - prefix used in id attributes to link HTML elements to existing Word nodes
//   error           - receives a description on failure
//
// Returns 1 on success. Returns 0, with *error set, if the package has no main document or no
// word:document element, or if any warnings were recorded during conversion. In the latter case
// the package has nevertheless been modified.
int WordConverterPut(DFDocument *html, DFStorage *abstractStorage, WordPackage *package, const char *idPrefix, DFError **error)
{
if (package->document == NULL) {
DFErrorFormat(error,"document.xml not found");
return 0;
}
DFNode *wordDocument = DFChildWithTag(package->document->docNode,WORD_DOCUMENT);
if (wordDocument == NULL) {
DFErrorFormat(error,"word:document not found");
return 0;
}
HTML_normalizeDocument(html);
HTML_pushDownInlineProperties(html->docNode);
WordConverter *converter = WordConverterNew(html,abstractStorage,package,idPrefix);
// FIXME: Need a more reliable way of telling whether this is a new document or not - it could be that the
// document already existed (with styles set up) but did not have any content
DFNode *wordBody = DFChildWithTag(wordDocument,WORD_BODY);
int creating = ((wordBody == NULL) || (wordBody->first == NULL));
converter->haveFields = Word_simplifyFields(converter->package);
Word_mergeRuns(converter->package);
assert(converter->package->styles);
// Build the stylesheet from the CSS embedded in the HTML document
CSSSheetRelease(converter->styleSheet);
converter->styleSheet = CSSSheetNew();
char *cssText = HTMLCopyCSSText(converter->html);
CSSSheetUpdateFromCSSText(converter->styleSheet,cssText);
free(cssText);
addMissingDefaultStyles(converter);
CSSEnsureReferencedStylesPresent(converter->html,converter->styleSheet);
if (creating)
CSSSetHTMLDefaults(converter->styleSheet);
CSSEnsureUnique(converter->styleSheet,converter->html,creating);
// Derive the section's page setup from the @page and body rules. The @page style is only looked
// up, whereas the body lookup passes 1 where the @page lookup passes 0.
// NOTE(review): that argument presumably requests creation of a missing style - confirm.
CSSStyle *pageStyle = CSSSheetLookupElement(converter->styleSheet,"@page",NULL,0,0);
CSSStyle *bodyStyle = CSSSheetLookupElement(converter->styleSheet,"body",NULL,1,0);
CSSProperties *page = (pageStyle != NULL) ? CSSPropertiesRetain(CSSStyleRule(pageStyle)) : CSSPropertiesNew();
CSSProperties *body = (bodyStyle != NULL) ? CSSPropertiesRetain(CSSStyleRule(bodyStyle)) : CSSPropertiesNew();
// Body margins that are not specified default to 10% on each side
if (CSSGet(body,"margin-left") == NULL)
CSSPut(body,"margin-left","10%");
if (CSSGet(body,"margin-right") == NULL)
CSSPut(body,"margin-right","10%");
if (CSSGet(body,"margin-top") == NULL)
CSSPut(body,"margin-top","10%");
if (CSSGet(body,"margin-bottom") == NULL)
CSSPut(body,"margin-bottom","10%");
WordSectionUpdateFromCSSPage(converter->mainSection,page,body);
WordPutData put;
put.conv = converter;
put.contentDoc = converter->package->document;
put.numIdByHtmlId = DFHashTableNew((DFCopyFunction)xstrdup,free);
put.htmlIdByNumId = DFHashTableNew((DFCopyFunction)xstrdup,free);
// Make sure we update styles.xml from the CSS stylesheet *before* doing any conversion of the content,
// since the latter requires a full mapping of CSS selectors to styleIds to be in place.
WordUpdateStyles(converter,converter->styleSheet);
Word_preProcessHTMLDoc(converter,converter->html);
// Work out which existing numbering definitions can be reused for the lists in the HTML
buildListMapFromHTML(&put,converter->html->docNode);
updateListTypes(&put);
WordBookmarks_removeCaptionBookmarks(converter->package->document);
WordObjectsCollapseBookmarks(converter->objects);
WordObjectsScan(converter->objects);
Word_setupBookmarkLinks(&put);
WordObjectsAnalyzeBookmarks(converter->objects,converter->styles);
// Run the document lens to update the concrete document from the HTML tree
WordDocumentLens.put(&put,converter->html->root,wordDocument);
WordObjectsExpandBookmarks(converter->objects);
WordRemoveNbsps(converter->package->document);
// Make sure the updateFields flag is set
Word_updateSettings(converter->package,converter->haveFields);
// Remove any abstract numbering definitions that are no longer referenced from concrete
// numbering definitions
WordNumberingRemoveUnusedAbstractNums(converter->numbering);
// Remove any relationships and images that have been removed from the HTML file and no longer
// have any other references pointing to them
WordGarbageCollect(converter->package);
CSSPropertiesRelease(page);
CSSPropertiesRelease(body);
DFHashTableRelease(put.numIdByHtmlId);
DFHashTableRelease(put.htmlIdByNumId);
// Any accumulated warnings are reported as a failure
int ok = 1;
if (converter->warnings->len > 0) {
DFErrorFormat(error,"%s",converter->warnings->data);
ok = 0;
}
WordConverterFree(converter);
return ok;
}
// Appends a printf-style formatted message to the converter's warning buffer. A non-empty
// warning buffer causes WordConverterGet()/WordConverterPut() to report failure.
void WordConverterWarning(WordConverter *converter, const char *format, ...)
{
    DFBuffer *sink = converter->warnings;
    va_list args;

    va_start(args,format);
    DFBufferVFormat(sink,format,args);
    va_end(args);
}
// Computes the Word styleId to use for a CSS style, based on its selector.
//
// Resolution order:
//   1. a fixed set of well-known selectors (normal table, table grid, default paragraph font,
//      list paragraph)
//   2. heading selectors whose class name starts with "heading_<level>", which become
//      "Heading<level>" followed by the remainder of the class name
//   3. the selectors span.Heading1Char .. span.Heading9Char
//   4. a rule depending on the selector's element: fixed names for figure and caption, the class
//      name for p/span/table, and for h1-h6 either the class name or "Heading<level>" when there
//      is no class name
//   5. the selector itself, if none of the above produced a result
//
// Returns a newly allocated string which the caller must free.
char *WordStyleIdForStyle(CSSStyle *style)
{
    // Selector -> styleId pairs that take precedence over everything else
    static const char *const wellKnown[][2] = {
        { "table.Normal_Table",          "TableNormal"          },
        { "table.Table_Grid",            "TableGrid"            },
        { "span.Default_Paragraph_Font", "DefaultParagraphFont" },
        { "p.List_Paragraph",            "ListParagraph"        },
    };
    // Character styles linked to headings keep their class name as the styleId
    static const char *const headingChars[][2] = {
        { "span.Heading1Char", "Heading1Char" },
        { "span.Heading2Char", "Heading2Char" },
        { "span.Heading3Char", "Heading3Char" },
        { "span.Heading4Char", "Heading4Char" },
        { "span.Heading5Char", "Heading5Char" },
        { "span.Heading6Char", "Heading6Char" },
        { "span.Heading7Char", "Heading7Char" },
        { "span.Heading8Char", "Heading8Char" },
        { "span.Heading9Char", "Heading9Char" },
    };

    const char *selector = style->selector;

    for (size_t i = 0; i < sizeof(wellKnown)/sizeof(wellKnown[0]); i++) {
        if (strcmp(selector,wellKnown[i][0]) == 0)
            return xstrdup(wellKnown[i][1]);
    }

    int headingLevel = CSSSelectorHeadingLevel(selector);
    if (headingLevel != 0) {
        char *prefix = DFFormatString("heading_%d",headingLevel);
        char *mapped = NULL;
        if ((style->className != NULL) && DFStringHasPrefix(style->className,prefix)) {
            char *rest = DFSubstring(style->className,strlen(prefix),strlen(style->className));
            mapped = DFFormatString("Heading%d%s",headingLevel,rest);
            free(rest);
        }
        free(prefix);
        if (mapped != NULL)
            return mapped;
    }

    for (size_t i = 0; i < sizeof(headingChars)/sizeof(headingChars[0]); i++) {
        if (strcmp(selector,headingChars[i][0]) == 0)
            return xstrdup(headingChars[i][1]);
    }

    char *styleId = NULL;
    char *className = CSSSelectorCopyClassName(selector);
    switch (CSSSelectorGetTag(selector)) {
        case HTML_FIGURE:
            styleId = DFStrDup("Figure");
            break;
        case HTML_CAPTION:
            styleId = DFStrDup("Caption");
            break;
        case HTML_H1:
        case HTML_H2:
        case HTML_H3:
        case HTML_H4:
        case HTML_H5:
        case HTML_H6:
            if ((className != NULL) && (strlen(className) != 0)) {
                styleId = DFStrDup(className);
            }
            else {
                int level = CSSSelectorHeadingLevel(selector);
                if ((level >= 1) && (level <= 6)) {
                    // FIXME: we shouldn't rely on the specific word "Heading" here - instead using the localised name
                    // FIXME: not covered by tests
                    styleId = DFFormatString("Heading%d",level);
                }
            }
            break;
        case HTML_P:
        case HTML_SPAN:
        case HTML_TABLE:
            styleId = DFStrDup(className);
            break;
        default:
            break;
    }
    free(className);

    if (styleId == NULL) {
        // Note: selector here may start with . (i.e. applies to all elements)
        // FIXME: not covered by tests
        styleId = xstrdup(selector);
    }
    return styleId;
}
// Classifies a CSS selector by the family of Word style it corresponds to, based on the
// selector's element: figure, caption and h1-h6 are paragraph styles; p is a paragraph style
// only when the selector has a class name; span is a character style; table is a table style.
// Everything else, including a bare p selector, is classified as special.
StyleFamily WordStyleFamilyForSelector(const char *selector)
{
    StyleFamily family = StyleFamilySpecial;

    switch (CSSSelectorGetTag(selector)) {
        case HTML_SPAN:
            family = StyleFamilyCharacter;
            break;
        case HTML_TABLE:
            family = StyleFamilyTable;
            break;
        case HTML_P: {
            char *className = CSSSelectorCopyClassName(selector);
            if (className != NULL)
                family = StyleFamilyParagraph;
            free(className);
            break;
        }
        case HTML_FIGURE:
        case HTML_CAPTION:
        case HTML_H1:
        case HTML_H2:
        case HTML_H3:
        case HTML_H4:
        case HTML_H5:
        case HTML_H6:
            family = StyleFamilyParagraph;
            break;
        default:
            break;
    }
    return family;
}
// Builds a tag-indexed lookup table of the element children of node.
//
//   node     - element whose direct children are indexed
//   children - caller-provided array of at least PREDEFINED_TAG_COUNT entries; it is cleared and
//              then children[tag] is set to the child with that tag
//
// Only children whose tag lies in [MIN_ELEMENT_TAG, PREDEFINED_TAG_COUNT) are recorded. If
// several children share a tag, the last one wins.
void childrenToArray(DFNode *node, DFNode **children)
{
    // memset() from <string.h> replaces the legacy bzero(), which is declared in <strings.h>
    // (not included by this file) and is no longer part of POSIX
    memset(children,0,PREDEFINED_TAG_COUNT*sizeof(DFNode *));
    for (DFNode *child = node->first; child != NULL; child = child->next) {
        if ((child->tag >= MIN_ELEMENT_TAG) && (child->tag < PREDEFINED_TAG_COUNT))
            children[child->tag] = child;
    }
}
// Replaces the children of node with a selection from a tag-indexed table.
//
//   node     - element whose children are replaced; all existing children are removed first
//   children - tag-indexed table as produced by childrenToArray()
//   tags     - zero-terminated list of tags, giving the order in which children are appended
//
// Tags with no entry in the table are skipped.
void replaceChildrenFromArray(DFNode *node, DFNode **children, Tag *tags)
{
    for (DFNode *old = node->first; old != NULL; old = node->first)
        DFRemoveNode(old);

    for (Tag *tag = tags; *tag != 0; tag++) {
        DFNode *kept = children[*tag];
        if (kept)
            DFAppendChild(node,kept);
    }
}