blob: 27fdbe4eac69078753ef5cb70c147a1973b8bc7c [file] [log] [blame]
/*******************************************************************************
 * Copyright (C) 2007 The University of Manchester
 *
 * Modifications to the initial code base are copyright of their
 * respective authors, or their employers as appropriate.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1 of
 * the License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
 ******************************************************************************/
package net.sf.taverna.t2.provenance.lineageservice;
import java.beans.ExceptionListener;
import java.beans.XMLEncoder;
import java.io.IOException;
import java.io.StringReader;
import java.io.StringWriter;
import java.sql.Blob;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;
import javax.sql.rowset.serial.SerialBlob;
import net.sf.taverna.t2.provenance.item.InputDataProvenanceItem;
import net.sf.taverna.t2.provenance.item.IterationProvenanceItem;
import net.sf.taverna.t2.provenance.item.OutputDataProvenanceItem;
import net.sf.taverna.t2.provenance.item.ProvenanceItem;
import net.sf.taverna.t2.provenance.item.WorkflowProvenanceItem;
import net.sf.taverna.t2.provenance.lineageservice.utils.Arc;
import net.sf.taverna.t2.provenance.lineageservice.utils.NestedListNode;
import net.sf.taverna.t2.provenance.lineageservice.utils.ProcBinding;
import net.sf.taverna.t2.provenance.lineageservice.utils.ProvenanceUtils;
import net.sf.taverna.t2.provenance.lineageservice.utils.Var;
import net.sf.taverna.t2.provenance.lineageservice.utils.VarBinding;
import net.sf.taverna.t2.provenance.vocabulary.SharedVocabulary;
import net.sf.taverna.t2.workflowmodel.Dataflow;
import net.sf.taverna.t2.workflowmodel.DataflowInputPort;
import net.sf.taverna.t2.workflowmodel.DataflowOutputPort;
import net.sf.taverna.t2.workflowmodel.Datalink;
import net.sf.taverna.t2.workflowmodel.Processor;
import net.sf.taverna.t2.workflowmodel.ProcessorInputPort;
import net.sf.taverna.t2.workflowmodel.ProcessorOutputPort;
import net.sf.taverna.t2.workflowmodel.processor.activity.Activity;
import net.sf.taverna.t2.workflowmodel.serialization.xml.XMLSerializerRegistry;
import org.apache.log4j.Level;
import org.apache.log4j.Logger;
import org.jdom.Document;
import org.jdom.Element;
import org.jdom.JDOMException;
import org.jdom.Namespace;
import org.jdom.input.SAXBuilder;
import org.jdom.output.XMLOutputter;
* @author Paolo Missier
public class EventProcessor {
* A map of UUIDs of the originating processor to the ProcBinding object
* that contains its parameters
private Map<String, ProcBinding> procBindingMap = new ConcurrentHashMap<String, ProcBinding>();;
/** A map of child ids to their parents in the hierarchy of events:
* workflow -> process -> processor -> activity -> iteration
private Map<String, String> parentChildMap= new ConcurrentHashMap<String, String>();
private static Logger logger = Logger.getLogger(EventProcessor.class);
private static final String OUTPUT_CONTAINER_PROCESSOR = "_OUTPUT_";
private static final String INPUT_CONTAINER_PROCESSOR = "_INPUT_";
private static final String TEST_EVENTS_FOLDER = "/tmp/TEST-EVENTS";
private static final String DATAFLOW_PROCESSOR_TYPE = "net.sf.taverna.t2.activities.dataflow.DataflowActivity";
private int eventCnt = 0; // for events logging
private volatile boolean workflowStructureDone = false; // used to inhibit processing of multiple workflow events -- we only need the first
private int dataflowDepth = 0; // incremented when we recurse on a subflow, decremented on exit
private volatile String wfInstanceID = null; // unique run ID. set when we see the first event of type "process"
String topLevelDataflowName = null;
String topLevelDataflowID = null;
Map<String, String> wfNestingMap = new ConcurrentHashMap<String, String>();
// all input bindings are accumulated here so they can be "backpatched" (see backpatching() )
List<VarBinding> allInputVarBindings = Collections.synchronizedList(new ArrayList<VarBinding>());
// dedicated class for processing WorkflowData events which carry workflow output info
private WorkflowDataProcessor wfdp;
private ProvenanceWriter pw = null;
private ProvenanceQuery pq = null;
public EventProcessor() { }
/**
 * Creates a processor that uses the given writer, query and workflow-data processor.
 *
 * @param pw writer through which provenance records are stored
 * @param pq query object through which existing provenance records are read
 * @param wfdp processor for WorkflowData events, which carry workflow output info
 * @throws SQLException declared in the signature; no statement in this constructor raises it
 * @throws ClassNotFoundException declared in the signature; no statement in this constructor raises it
 * @throws IllegalAccessException declared in the signature; no statement in this constructor raises it
 * @throws InstantiationException declared in the signature; no statement in this constructor raises it
 */
public EventProcessor(ProvenanceWriter pw, ProvenanceQuery pq, WorkflowDataProcessor wfdp)
        throws InstantiationException, IllegalAccessException,
        ClassNotFoundException, SQLException {
    // the assignment target for pw had been lost, leaving the writer field unset
    this.pw = pw;
    this.pq = pq;
    this.wfdp = wfdp;
}
/**
 * this is the new version that makes use of the T2 deserializer
 * populate static portion of the DB<br/>
 * the static structure may already be in the DB -- this is detected as a duplicate top-level workflow ID.
 * In this case, the static portion for that workflow is cleared and written again.
 * @param provenanceItem
 *            the workflow provenance item carrying the dataflow; it is cast to
 *            WorkflowProvenanceItem and its Dataflow object is used directly
 * @return the identifier of the top-level dataflow, or null if a workflow structure
 *         has already been processed
 */
public String processWorkflowStructure(ProvenanceItem provenanceItem) {
// Records the static structure of the top-level dataflow carried by provenanceItem and returns
// the result of processDataflowStructure for it; returns null when a workflow structure has
// already been processed since workflowStructureDone was last reset.
// this flag is set to prevent processing of separate workflowProvenanceItems that describe nested workflows.
// the processing of all nested workflows is done as part of the very first workflowProvenanceItem that we receive,
// which is self-consistent. so we ignore all others
if (workflowStructureDone) { return null; }
// logger.debug("Workflow instance is: " + getWfInstanceID());
Dataflow df = null;
// unconditional cast: the caller must pass a WorkflowProvenanceItem
df = ((WorkflowProvenanceItem)provenanceItem).getDataflow();
workflowStructureDone = true;
// remember name and id of the top-level dataflow; other methods of this class read these fields
topLevelDataflowName = df.getLocalName();
topLevelDataflowID = df.getInternalIdentier();
// check whether we already have this WF in the DB
List<String> wfNames = null;
try {
wfNames = pq.getAllWFnames();
} catch (SQLException e) {
// a failed lookup leaves wfNames null, which the test below treats like "not in the DB"
logger.warn("Problem processing workflow structure", e);
if (wfNames != null && wfNames.contains(topLevelDataflowID)) { // already in the DB
//"workflow structure with ID "+topLevelDataflowID+" is in the DB -- clearing static portion");
// clearing the portion of the static DB that pertains to this specific WF.
// it is going to be rewritten right away in the rest of this method
// this is simpler to implement than selectively avoiding duplicate writes to the DB
try {
// NOTE(review): this try block contains no statement in this copy of the file; the call that
// clears the static DB portion described above appears to be missing -- confirm against upstream
} catch (SQLException e) {
logger.warn("Can't clear static database for " + topLevelDataflowID, e);
} else {
//"new workflow structure with ID "+topLevelDataflowID);
// record the top level dataflow as a processor in the DB
try {
pw.addProcessor(topLevelDataflowName, DATAFLOW_PROCESSOR_TYPE, topLevelDataflowID, true); // true -> is top level
} catch (SQLException e) {
// failure to add the processor is logged only; processing continues
logger.warn("Can't add processor " + topLevelDataflowID, e);
//"top level wf name: "+topLevelDataflowName);
return processDataflowStructure(df, topLevelDataflowID, df.getLocalName()); // the dataflow's own local name is passed as the external name
/**
 * note: this method can be called as part of a recursion on sub-workflows
 * @param df the dataflow whose processors, ports and links are recorded
 * @param dataflowID the UUID for the entire dataflow (may be a sub-dataflow)
 * @param externalName the external name of the dataflow: for a sub-dataflow, the local name of the
 *        processor that contains it; when not null it replaces the default input/output container
 *        processor names
 * @return the dataflowID that was passed in
 */
public String processDataflowStructure(Dataflow df, String dataflowID, String externalName) {
// Writes the static description of df through pw (workflow id record, processors, variables, arcs),
// recursing into nested dataflows, and returns dataflowID.
// NOTE(review): several statements in this method are truncated or missing in this copy of the file
// (marked below); the comments describe only what is visible.
// starts as the current run ID; replaced by the parent's run ID when df is nested (see below)
String localWfInstanceID = getWfInstanceID();
try {
List<Var> vars = new ArrayList<Var>();
// check whether we already have this WF in the DB
List<String> wfNames = null;
try {
wfNames = pq.getAllWFnames();
} catch (SQLException e) {
// a failed lookup leaves wfNames null, which the test below treats like "not in the DB"
logger.warn("Problem processing dataflow structure for " + dataflowID, e);
if (wfNames != null && wfNames.contains(dataflowID)) { // already in the DB
//"workflow structure with ID "+dataflowID+" is in the DB -- clearing static portion");
// clearing the portion of the static DB that pertains to this specific WF.
// it is going to be rewritten right away in the rest of this method
// this is simpler to implement than selectively avoiding duplicate writes to the DB
// NOTE(review): no statement is visible in this branch in this copy of the file
} else {
// logger.warn("new workflow structure with ID "+dataflowID);
// //////
// add workflow ID -- this is NOT THE SAME AS the wfInstanceID
// /////
// this could be a nested workflow -- in this case, override its wfInstanceID with that of its parent
String parentDataflow;
// wfNestingMap holds child id -> parent id; no entry means no parent was registered for this dataflow
if ((parentDataflow = wfNestingMap.get(dataflowID)) == null) {
Element serializeDataflow = XMLSerializerRegistry.getInstance().getSerializer().serializeDataflow(df);
String dataflowString = null;
try {
XMLOutputter outputter = new XMLOutputter();
StringWriter stringWriter = new StringWriter();
outputter.output(serializeDataflow, stringWriter);
dataflowString = stringWriter.toString();
// NOTE(review): the exception type is missing from the catch clause below in this copy of the file
} catch ( e) {
logger.error("Could not serialise dataflow", e);
// NOTE(review): dataflowString is still null at this point if the serialisation above failed
Blob blob = new SerialBlob(dataflowString.getBytes("UTF-8"));
// this is a top level dataflow description
pw.addWFId(dataflowID, null, externalName, blob); // set its dataflowID with no parent
} else {
// same serialisation as in the branch above, for the nested case
Element serializeDataflow = XMLSerializerRegistry.getInstance().getSerializer().serializeDataflow(df);
String dataflowString = null;
try {
XMLOutputter outputter = new XMLOutputter();
StringWriter stringWriter = new StringWriter();
outputter.output(serializeDataflow, stringWriter);
dataflowString = stringWriter.toString();
// NOTE(review): the exception type is missing from the catch clause below in this copy of the file
} catch ( e) {
logger.error("Could not serialise dataflow", e);
// NOTE(review): dataflowString is still null at this point if the serialisation above failed
Blob blob = new SerialBlob(dataflowString.getBytes("UTF-8"));
// we are processing a nested workflow structure
logger.debug("dataflow "+dataflowID+" with external name "+externalName+" is nested within "+parentDataflow);
pw.addWFId(dataflowID, parentDataflow, externalName, blob); // set its dataflowID along with its parent
// override wfInstanceID to point to top level -- UNCOMMENTED PM 9/09 CHECK
// takes the first run returned for the parent; assumes at least one run is recorded for it
localWfInstanceID = pq.getRuns(parentDataflow, null).get(0).getInstanceID();
// logger.debug("overriding nested WFRef "+getWfInstanceID()+" with parent WFRef "+localWfInstanceID);
pw.addWFInstanceId(dataflowID, localWfInstanceID); // wfInstanceID stripped by stripWfInstanceHeader() above
// //////
// add processors along with their variables
// /////
List<? extends Processor> processors = df.getProcessors();
for (Processor p : processors) {
//"adding processor "+p.getLocalName());
String pName = p.getLocalName();
//CHECK get type of first activity and set this as the type of the processor itself
List<? extends Activity<?>> activities = p.getActivityList();
// pType stays null when the processor has no activities
String pType = null;
if (activities != null && !activities.isEmpty()) {
pType = activities.get(0).getClass().getCanonicalName();
pw.addProcessor(pName, pType, dataflowID, false); // false: not a top level processor
// ///
// add all input ports for this processor as input variables
// ///
List<? extends ProcessorInputPort> inputs = p.getInputPorts();
for (ProcessorInputPort ip : inputs) {
// NOTE(review): the statements that populate inputVar and add it to vars are not present in this copy
Var inputVar = new Var();
//"processDataflowStructure: adding input var "+pName+":"+ip.getName());
// ///
// add all output ports for this processor as output variables
// ///
// NOTE(review): the next statement is truncated in this copy of the file
List<? extends ProcessorOutputPort> outputs = p
for (ProcessorOutputPort op : outputs) {
// NOTE(review): the statements that populate outputVar and add it to vars are not present in this copy
Var outputVar = new Var();
// check for nested structures: if the activity is DataflowActivity
// then this processor is a nested workflow
// make an entry into wfNesting map with its ID and recurse on the nested workflow
for (Activity a:activities) {
// nested dataflows are recognised by the substring "DataflowActivity" in the activity's class name
if (a.getClass().getCanonicalName().contains("DataflowActivity" )) {
Dataflow nested = (Dataflow) a.getConfiguration();
//"RECURSION ON nested workflow: "+p.getLocalName()+" with id: "+nested.getInternalIdentier());
wfNestingMap.put(nested.getInternalIdentier(), dataflowID); // child -> parent
// the value returned by the recursive call is not used
processDataflowStructure(nested, nested.getInternalIdentier(), p.getLocalName());
//List<? extends Processor> procs = nested.getProcessors();
// for (Processor nestedP:procs) {
// System.out.println("recursion on nested processor: "+nestedP.getLocalName());
// }
} // end for each processor
// ////
// add inputs to entire dataflow
// ////
String pName = INPUT_CONTAINER_PROCESSOR; // overridden -- see below
// check whether we are processing a nested workflow. in this case
// the input vars are not assigned to the INPUT processor but to the containing dataflow
if (externalName != null) { // override the default if we are nested or someone external name is provided
pName = externalName;
List<? extends DataflowInputPort> inputPorts = df.getInputPorts();
for (DataflowInputPort ip : inputPorts) {
// NOTE(review): only setInput is visible here; any other population of inputVar is not present in this copy
Var inputVar = new Var();
inputVar.setInput(true); // CHECK PM modified 11/08 -- input vars are actually outputs of input processors...
// ////
// add outputs of entire dataflow
// ////
pName = OUTPUT_CONTAINER_PROCESSOR; // overridden -- see below
// check whether we are processing a nested workflow. in this case
// the output vars are not assigned to the OUTPUT processor but to the containing dataflow
if (externalName != null) { // we are nested
pName = externalName;
// NOTE(review): the next statement is truncated in this copy of the file
List<? extends DataflowOutputPort> outputPorts = df
for (DataflowOutputPort op : outputPorts) {
// NOTE(review): only setInput is visible here; any other population of outputVar is not present in this copy
Var outputVar = new Var();
outputVar.setInput(false); // CHECK PM modified 11/08 -- output vars are actually outputs of output processors...
// hand the variables collected in vars to the writer for this dataflow
pw.addVariables(vars, dataflowID);
// ////
// add arc records using the dataflow links
// retrieving the processor names requires navigating from links to
// source/sink and from there to the processors
// ////
List<? extends Datalink> links = df.getLinks();
for (Datalink l : links) {
// TODO cover the case of arcs from an input and to an output to
// the entire dataflow
// a name left null below means the link end is not a processor port
String sourcePname = null;
String sinkPname = null;
if (l.getSource() instanceof ProcessorOutputPort) {
// NOTE(review): the next statement is truncated in this copy of the file
sourcePname = ((ProcessorOutputPort) l.getSource())
} else {
// System.out.println("found link from dataflow input");
if (l.getSink() instanceof ProcessorInputPort) {
// NOTE(review): the next statement is truncated in this copy of the file
sinkPname = ((ProcessorInputPort) l.getSink())
} else {
// System.out.println("found link to dataflow output");
if (sourcePname != null && sinkPname != null) {
// System.out.println("adding regular internal arc");
pw.addArc(l.getSource().getName(), sourcePname, l.getSink()
.getName(), sinkPname, dataflowID);
} else if (sourcePname == null) {
// link is from dataflow input or subflow input
if (externalName != null) { // link from subflow input
sourcePname = externalName;
} else {
// NOTE(review): no statement is visible in this else branch in this copy of the file
//Ian added this logic since there were some null sinkPnameRefs with merge ports
if (sinkPname == null) {
// link is to dataflow output
if (externalName != null) { // link from subflow input
sinkPname = externalName;
} else {
// NOTE(review): no statement is visible in this else branch in this copy of the file
// System.out.println("adding arc from dataflow input");
// NOTE(review): the two lines below are the tail of a call whose first line is missing in this copy
sourcePname, l.getSink().getName(),
sinkPname, dataflowID);
} else if (sinkPname == null) {
// link is to dataflow output
if (externalName != null) { // link from subflow input
sinkPname = externalName;
} else {
// NOTE(review): no statement is visible in this else branch in this copy of the file
//Ian added this bit at the same time as the null sinkPnameRef logic above - hope it is correct
if (sourcePname == null) {
// link is from dataflow input or subflow input
if (externalName != null) { // link from subflow input
sourcePname = externalName;
} else {
// NOTE(review): no statement is visible in this else branch in this copy of the file
// System.out.println("adding arc to dataflow output");
// NOTE(review): the call below is truncated in this copy of the file
pw.addArc(l.getSource().getName(), sourcePname, l.getSink()
.getName(), sinkPname,
//"completed processing dataflow " + dataflowID);
} catch (Exception e) {
// any failure above is logged only; the method still returns dataflowID
logger.error("Problem processing provenance for dataflow", e);
// logger.debug("wfInstanceID at the end of processDataflowStructure: "+getWfInstanceID());
return dataflowID;
private Element stripWfInstanceHeader(String content) {
SAXBuilder b = new SAXBuilder();
Document d;
try {
d = (new StringReader(content));
// get identifier from <workflowItem> element
Element root = d.getRootElement();
Namespace ns = Namespace.getNamespace("");
Element workflowEl = root.getChild("workflow", ns);
return workflowEl;
} catch (JDOMException e) {
logger.warn("Problem stripping workflow instance header", e);
} catch (IOException e) {
logger.warn("Problem stripping workflow instance header", e);
return null;
/**
 * processes an elementary process execution event from T2. Collects info
 * from events as they happen and sends them to the writer for processing
 * when the iteration event is received. Uses the map of procBindings to
 * process event id and the map of child ids to parent ids to ensure that
 * the correct proc binding is used
 * @param provenanceItem the event to process; its event type selects the handling
 * @param currentWorkflowID id of the workflow the event belongs to; passed on when the
 *        input and output data of an iteration event are processed
 */
public void processProcessEvent(ProvenanceItem provenanceItem, String currentWorkflowID) {
// Dispatches on provenanceItem.getEventType(): process, processor and activity events only record
// bookkeeping in parentChildMap / procBindingMap; an iteration event processes its input and output
// data; an end-of-workflow event triggers reconciliation of the top-level outputs.
// NOTE(review): several statements are missing in this copy of the file (marked below).
if (provenanceItem.getEventType().equals(SharedVocabulary.PROCESS_EVENT_TYPE)) {
String parentId = provenanceItem.getParentId(); // this is the workflowID
String identifier = provenanceItem.getIdentifier(); // use this as wfInstanceID if this is the top-level process
parentChildMap.put(identifier, parentId);
// a fresh ProcBinding is registered under the process identifier; later events look it up from there
ProcBinding pb = new ProcBinding();
procBindingMap.put(identifier, pb);
} else if (provenanceItem.getEventType().equals(SharedVocabulary.PROCESSOR_EVENT_TYPE)) {
String identifier = provenanceItem.getIdentifier();
String parentId = provenanceItem.getParentId();
String processID = provenanceItem.getProcessId(); // this is the external process ID
// this has the weird form facade0:dataflowname:pname need to extract pname from here
String[] processName = processID.split(":");
// assumes a process event with identifier == parentId was seen earlier; otherwise the lookup returns null
procBindingMap.get(parentId).setPNameRef(processName[processName.length-1]); // 3rd component of composite name
parentChildMap.put(identifier, parentId);
} else if (provenanceItem.getEventType().equals(SharedVocabulary.ACTIVITY_EVENT_TYPE)) {
String identifier = provenanceItem.getIdentifier();
String parentId = provenanceItem.getParentId();
parentChildMap.put(identifier, parentId);
} else if (provenanceItem.getEventType().equals(SharedVocabulary.ITERATION_EVENT_TYPE)) {
// traverse up to root to retrieve ProcBinding that was created when we saw the process event
String iterationID = provenanceItem.getIdentifier();
String activityID = provenanceItem.getParentId();
// activity -> processor -> process, following the parent links recorded by the earlier events
String processorID = parentChildMap.get(activityID);
String processID = parentChildMap.get(processorID);
// the same entry is written twice; the second put is redundant
parentChildMap.put(iterationID, activityID);
parentChildMap.put(iterationID, activityID);
ProcBinding procBinding = procBindingMap.get(processID);
// NOTE(review): no use of itVector is visible in this copy of the file
String itVector = extractIterationVector(ProvenanceUtils.iterationToString(((IterationProvenanceItem)provenanceItem).getIteration()));
InputDataProvenanceItem inputDataEl = ((IterationProvenanceItem)provenanceItem).getInputDataItem();
OutputDataProvenanceItem outputDataEl = ((IterationProvenanceItem)provenanceItem).getOutputDataItem();
processInput(inputDataEl, procBinding, currentWorkflowID);
processOutput(outputDataEl, procBinding, currentWorkflowID);
try {
// NOTE(review): this try block contains no statement in this copy of the file; judging by the log
// message below it presumably inserted the processor binding -- confirm against upstream
} catch (SQLException e) {
logger.debug("provenance has duplicate processor binding -- skipping the insertion"); //, e);
} else if (provenanceItem.getEventType().equals(SharedVocabulary.END_WORKFLOW_EVENT_TYPE)) {
// use this event to do housekeeping on the input/output varbindings
// dataflowDepth is 0 when we are not inside a subflow (see the field's declaration)
if (dataflowDepth == 0) {
// process the outputs accumulated by WorkflowDataProcessor
getWfdp().processTrees(provenanceItem.getWorkflowId(), getWfInstanceID());
// PM changed 23/4/09
reconcileTopLevelOutputs(); // patchTopLevelOutputs
workflowStructureDone = false; // CHECK reset for next run...
} else if (provenanceItem.getEventType().equals(SharedVocabulary.WORKFLOW_DATA_EVENT_TYPE)) {
// give this event to a WorkflowDataProcessor object for pre-processing
// NOTE(review): no statement is visible in this branch in this copy of the file
// try {
// TODO may generate an exception when the data is an error CHECK
// } catch (NumberFormatException e) {
// logger.error(e);
// }
//"Received workflow data - not processing");
//FIXME not sure - needs to be stored somehow
} else if (provenanceItem.getEventType().equals((SharedVocabulary.ERROR_EVENT_TYPE))) {
//TODO process the error
} else {
// TODO broken, should we throw something here?
/**
 * fills in the VarBindings (VBs) for the global inputs -- this removes the need for explicit events
 * that account for these value bindings...
 */
public void patchTopLevelnputs() {
// For every input variable of the top-level dataflow, follows its first outgoing arc and fetches the
// VarBindings of the port at the far end of that arc.
// NOTE(review): the statements that write those bindings back under the global input name are not
// present in this copy of the file (the innermost loop below has no body).
// for each input I to topLevelDataflow:
// pick first outgoing arc with sink P:X
// copy value X to I -- this can be a collection, so copy everything
// get all global input vars
//"\n\n BACKPATCHING GLOBAL INPUTS with dataflowDepth = "+dataflowDepth+"*******\n");
List<Var> inputs=null;
try {
inputs = getPq().getInputVars(topLevelDataflowName, topLevelDataflowID, getWfInstanceID());
for (Var input:inputs) {
//"global input: "+input.getVName());
// a fresh constraint map is built for each input
Map<String,String> queryConstraints = new HashMap<String,String>();
queryConstraints.put("sourceVarNameRef", input.getVName());
queryConstraints.put("sourcePNameRef", input.getPName());
List<Arc> outgoingArcs = getPq().getArcs(queryConstraints);
// any arc will do, use the first
// get(0) raises IndexOutOfBoundsException when the input has no outgoing arc; that is caught below
String targetPname = outgoingArcs.get(0).getSinkPnameRef();
String targetVname = outgoingArcs.get(0).getSinkVarNameRef();
//"copying values from ["+targetPname+":"+targetVname+"] for instance ID: ["+wfInstanceID+"]");
// NOTE(review): the two source* constraints added above are still in the map when it is passed to
// getVarBindings -- no removal is visible in this copy; confirm this is intended
queryConstraints.put("varNameRef", targetVname);
queryConstraints.put("V.pNameRef", targetPname);
queryConstraints.put("VB.wfInstanceRef", getWfInstanceID());
queryConstraints.put("V.wfInstanceRef", topLevelDataflowID);
List<VarBinding> VBs = getPq().getVarBindings(queryConstraints);
//"found the following VBs:");
for (VarBinding vb:VBs) {
// insert VarBinding back into VB with the global input varname
} catch (SQLException e) {
logger.warn("Patch top level inputs problem for provenance", e);
} catch (IndexOutOfBoundsException e) {
logger.error("Could not patch top level", e);
// PM added 23/4/09
/**
 * reconcile the top level outputs with the results from its immediate predecessors in the graph.<br/>
 * various cases have to be considered: predecessors may include records that are not in the output,
 * while the output may include nested list structures that are not in the predecessors. This method accounts
 * for a 2-way reconciliation that considers all possible cases.<br/>
 * at the end, outputs and their predecessors contain the same data.<p/>
 * NOTE: if we assume that data values (URIs) are <em>always</em> unique then this is greatly simplified by just
 * comparing two sets of value records by their URIs and reconciling them. But this is not the way it is done here
 */
public void reconcileTopLevelOutputs() {
// For every output variable of the top-level dataflow: finds the source port of its first incoming
// arc, fetches that port's VarBindings, looks up a matching binding on the output for each of them,
// and finally re-files the output's collection records under the source port.
// NOTE(review): the pseudo-code lines below lost their block-comment delimiters in this copy of the
// file, and the statements that copy values between bindings are not present.
for each output O
for each variable V in predecessors(O)
fetch all VB records for O into list OValues
fetch all VB records for V into list Yalues
compare OValues and VValues:
it SHOULD be the case that OValues is a subset of YValues. Under this assumption:
for each vb in YValues:
- if there is a matching o in OValues then (vb may be missing collection information)
copy o to vb
if vb has no collection info && there is a matching tree node tn in OTree (use iteration index for the match) then
set vb to be in collection tb
copy vb to o
finally copy all Collection records for O in OTree -- catch duplicate errors
// NOTE(review): this one map is reused for every query below and no clear() is visible in this copy,
// so constraints put for one query are still present for the next -- confirm against upstream
Map<String,String> queryConstraints = new HashMap<String,String>();
List<Var> outputs=null;
try {
outputs = pq.getOutputVars(topLevelDataflowName, topLevelDataflowID, null); // null InstanceID
// for each output O
for (Var output:outputs) {
// collect all VBs for O
// String oPName = output.getPName();
// String oVName = output.getVName();
// queryConstraints.put("varNameRef", oVName);
// queryConstraints.put("V.pNameRef", oPName);
// queryConstraints.put("VB.wfInstanceRef", wfInstanceID);
// queryConstraints.put("V.wfInstanceRef", topLevelDataflowID);
// List<VarBinding> OValues = pq.getVarBindings(queryConstraints);
// find all records for the immediate precedessor Y of O
queryConstraints.put("sinkVarNameRef", output.getVName());
queryConstraints.put("sinkPNameRef", output.getPName());
List<Arc> incomingArcs = pq.getArcs(queryConstraints);
// there can be only one -- but check that there is one!
// an output with no incoming arc is skipped
if (incomingArcs.size()==0) continue;
String sourcePname = incomingArcs.get(0).getSourcePnameRef();
String sourceVname = incomingArcs.get(0).getSourceVarNameRef();
queryConstraints.put("varNameRef", sourceVname);
queryConstraints.put("V.pNameRef", sourcePname);
queryConstraints.put("VB.wfInstanceRef", getWfInstanceID());
queryConstraints.put("V.wfInstanceRef", topLevelDataflowID);
List<VarBinding> YValues = pq.getVarBindings(queryConstraints);
// for each YValue look for a match in OValues
// (assume the YValues values are a superset of OValues)!)
for (VarBinding yValue:YValues) {
// System.out.println("reconcileTopLevelOutputs:: processing "+
// yValue.getPNameRef()+"/"+yValue.getVarNameRef()+"/"+yValue.getValue()+
// " with collid "+yValue.getCollIDRef());
// look for a matching record in VarBinding for output O
queryConstraints.put("varNameRef", output.getVName());
queryConstraints.put("V.pNameRef", output.getPName());
queryConstraints.put("VB.wfInstanceRef", getWfInstanceID());
queryConstraints.put("V.wfInstanceRef", topLevelDataflowID);
queryConstraints.put("VB.iteration", yValue.getIteration());
// the collection constraints are only added when yValue belongs to a collection
if (yValue.getCollIDRef()!= null) {
queryConstraints.put("VB.collIDRef", yValue.getCollIDRef());
queryConstraints.put("VB.positionInColl", Integer.toString(yValue.getPositionInColl()));
List<VarBinding> matchingOValues = pq.getVarBindings(queryConstraints);
// System.out.println("querying for matching oValues: ");
// result at most size 1
if (matchingOValues.size() > 0) {
// NOTE(review): no use of oValue is visible in this copy of the file
VarBinding oValue = matchingOValues.get(0);
// System.out.println("found "+oValue.getPNameRef()+"/"+oValue.getVarNameRef()+"/"+oValue.getValue()+
// " with collid "+oValue.getCollIDRef());
// copy collection info from oValue to yValue
// System.out.println("oValue copied to yValue");
} else {
// System.out.println("no match found");
// copy the yValue to O
// insert VarBinding back into VB with the global output varname
} // for each yValue in YValues
// copy all Collection records for O to Y
// get all collections refs for O
queryConstraints.put("wfInstanceRef", getWfInstanceID());
queryConstraints.put("PNameRef", output.getPName());
queryConstraints.put("varNameRef", output.getVName());
List<NestedListNode> oCollections = pq.getNestedListNodes(queryConstraints);
// insert back as collection refs for Y -- catch duplicates
for (NestedListNode nln:oCollections) {
// System.out.println("collection: "+nln.getCollId());
getPw().replaceCollectionRecord(nln, sourcePname, sourceVname);
} // for each output var
} catch (SQLException e) {
// a database failure anywhere above is logged only
logger.warn("Problem reconciling top level outputs", e);
private void processOutput(OutputDataProvenanceItem provenanceItem, ProcBinding procBinding, String currentWorkflowID) {
Element dataItemAsXML = ProvenanceUtils.getDataItemAsXML(provenanceItem);
List<Element> outputPorts = dataItemAsXML.getChildren("port");
for (Element outputport : outputPorts) {
String portName = outputport.getAttributeValue("name");
// value type may vary
List<Element> valueElements = outputport.getChildren();
if (valueElements != null && valueElements.size() > 0) {
Element valueEl = valueElements.get(0); // only really 1 child
processVarBinding(valueEl, procBinding.getPNameRef(), portName, procBinding.getIterationVector(),
getWfInstanceID(), currentWorkflowID);
/**
 * this method reconciles values in varBindings across an arc: Firstly, if a binding's value is within a collection,
 * _and_ it is copied from a value generated during a previous iteration,
 * then this method propagates the list reference to that iteration value, which wouldn't have it.
 * Conversely, if a binding is going to be input to an iteration, then it has lost its containing list node, and we
 * put it back in by looking at the corresponding predecessor
 * @param newBindings the bindings to reconcile, each examined in turn
 * @throws SQLException if a provenance query fails
 */
private void backpatchIterationResults(List<VarBinding> newBindings) throws SQLException {
// For each binding that is a member of a collection, finds the source port of the first incoming arc
// of its port and compares collection membership with that port's bindings holding the same value.
// NOTE(review): only logging is visible in the two reconcile branches below; the statements that
// actually update the bindings are not present in this copy of the file.
logger.debug("backpatchIterationResults: start");
for (VarBinding vb:newBindings) {
logger.debug("backpatchIterationResults: processing vb "+vb.getPNameRef()+"/"+vb.getVarNameRef()+"="+vb.getValue());
if (vb.getCollIDRef()!= null) { // this is a member of a collection
logger.debug("...which is inside a collection ");
// look for its antecedent
Map<String,String> queryConstraints = new HashMap<String,String>();
queryConstraints.put("sinkVarNameRef", vb.getVarNameRef());
queryConstraints.put("sinkPNameRef", vb.getPNameRef());
queryConstraints.put("wfInstanceRef", pq.getWfNames(vb.getWfInstanceRef()).get(0)); // CHECK picking first element in list...
List<Arc> incomingArcs = pq.getArcs(queryConstraints);
// there can be only one -- but check that there is one!
// NOTE(review): this returns from the whole method, so bindings later in newBindings are not
// examined once one binding has no incoming arc -- confirm that "continue" was not intended
if (incomingArcs.size()==0) return;
String sourcePname = incomingArcs.get(0).getSourcePnameRef();
String sourceVname = incomingArcs.get(0).getSourceVarNameRef();
logger.debug("antecedent: "+sourcePname+":"+sourceVname);
// get the varbindings for this port and select the one with the same iteration vector as its successor
// the sink* and wfInstanceRef constraints put above are still in the map for this query
queryConstraints.put("varNameRef", sourceVname);
queryConstraints.put("V.pNameRef", sourcePname);
queryConstraints.put("VB.value", vb.getValue());
queryConstraints.put("VB.wfInstanceRef", vb.getWfInstanceRef());
List<VarBinding> VBs = pq.getVarBindings(queryConstraints);
if (VBs.size() == 0) { logger.debug("nothing to reconcile"); }
// reconcile
for (VarBinding b:VBs) {
logger.debug("backpatching "+sourceVname+" "+sourcePname);
// successor is in a collection, predecessor is not
if (vb.getCollIDRef() != null && b.getCollIDRef() == null) {
logger.debug("successor "+vb.getVarNameRef()+" is in collection "+vb.getCollIDRef()+" but pred "+b.getVarNameRef()+" is not");
logger.debug("putting "+b.getVarNameRef()+" in collection "+vb.getCollIDRef()+" at pos "+vb.getPositionInColl());
// predecessor is in a collection, successor is not
} else if (vb.getCollIDRef() == null && b.getCollIDRef() != null) {
logger.debug("successor "+vb.getVarNameRef()+" is NOT in collection but pred "+b.getVarNameRef()+" IS");
logger.debug("putting "+vb.getVarNameRef()+" in collection "+b.getCollIDRef()+" at pos "+b.getPositionInColl());
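/**
 * create one new VarBinding record for each input port binding
 * @param provenanceItem the input data item of the iteration event
 * @param procBinding supplies the processor name and the iteration vector for the bindings
 * @param currentWorkflowID id of the workflow the event belongs to
 */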
private void processInput(InputDataProvenanceItem provenanceItem, ProcBinding procBinding, String currentWorkflowID) {
// For every "port" element of the data item's XML form: looks up the matching Var record, then hands
// the port's first child element to processVarBinding.
// NOTE(review): several statements are missing in this copy of the file (marked below).
Element dataItemAsXML = ProvenanceUtils.getDataItemAsXML(provenanceItem);
List<Element> inputPorts = dataItemAsXML.getChildren("port");
// NOTE(review): no use of order is visible in this copy of the file
int order = 0;
for (Element inputport : inputPorts) {
String portName = inputport.getAttributeValue("name");
//"processInput: processing VarBinding for "+procBinding.getPNameRef()+" "+portName);
try {
// add process order sequence to Var for this portName
Map<String, String> queryConstraints = new HashMap<String, String>();
queryConstraints.put("wfInstanceRef", currentWorkflowID);
queryConstraints.put("pnameRef", procBinding.getPNameRef());
queryConstraints.put("varName", portName);
// presumably "1" selects input variables -- confirm against the query implementation
queryConstraints.put("inputOrOutput", "1");
List<Var> vars = getPq().getVars(queryConstraints);
try {
// get(0) raises IndexOutOfBoundsException when no Var matches; that is caught below
// NOTE(review): whatever was done with v is not present in this copy of the file
Var v = vars.get(0);
catch (IndexOutOfBoundsException e) {
logger.error("Could not process input " + portName, e);
} catch (SQLException e1) {
logger.error("Could not process input " + portName, e1);
// value type may vary
List<Element> valueElements = inputport.getChildren(); // hopefully
// in the
// right
// order...
if (valueElements != null && valueElements.size() > 0) {
Element valueEl = valueElements.get(0); // expect only 1 child
// processVarBinding(valueEl, processor, portName, iterationVector,
// dataflow);
List<VarBinding> newBindings =
processVarBinding(valueEl, procBinding.getPNameRef(), portName, procBinding.getIterationVector(),
getWfInstanceID(), currentWorkflowID);
// this is a list whenever valueEl is of type list: in this case processVarBinding recursively
// processes all values within the collection, and generates one VarBinding record for each of them
// logger.debug("newBindings now has "+newBindings.size()+" elements");
// // if the new binding involves list values, then check to see if they need to be propagated back to
// // results of iterations
try {
// NOTE(review): this try block contains no statement in this copy of the file; judging by the log
// message below it presumably called backpatchIterationResults(newBindings) -- confirm against upstream
} catch (SQLException e) {
logger.warn("Problem with back patching iteration results", e);
} else {
// no value element for this port: only a debug trace is written
if (valueElements != null) logger.debug("port name "+portName+" "+valueElements.size());
else logger.debug("valueElements is null for port name "+portName);
* capture the default case where the value is not a list
* @param valueEl
* @param processorId
* @param portName
* @param iterationId
* @param wfInstanceRef
* @param currentWorkflowID
private List<VarBinding> processVarBinding(Element valueEl, String processorId,
String portName, String iterationId, String wfInstanceRef, String currentWorkflowID) {
// uses the defaults:
// collIdRef = null
// parentcollectionRef = null
// positionInCollection = 1
return processVarBinding(valueEl, processorId, portName, null, 1, null,
iterationId, wfInstanceRef, null, currentWorkflowID);
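/**
 * General case where the value can be a list.
 *
 * @param valueEl XML element holding the value; its element name selects the value type
 * @param processorId name of the processor the binding belongs to
 * @param portName name of the port the value is bound to
 * @param collIdRef id of the collection the value belongs to, or null
 * @param positionInCollection position of the value within that collection
 * @param parentCollectionRef reference to the parent collection, or null
 * @param iterationId iteration identifier, used only when itVector is null
 * @param wfInstanceRef workflow instance reference
 * @param itVector iteration vector to use as-is; when null it is derived from iterationId
 * @param currentWorkflowID identifier of the current workflow
 */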
// Dispatches on the element name of valueEl ("literal", "referenceSet", "list", "error") and
// returns the list 'newBindings'. For a "list" it registers the collection and recurses into
// every child element with an extended iteration vector.
// NOTE(review): the statements that populate 'vb', write it through the ProvenanceWriter and
// add it to 'newBindings' are not visible in this view, nor are the closing braces -- confirm
// against the complete file before changing any logic here.
private List<VarBinding> processVarBinding(Element valueEl, String processorId,
String portName, String collIdRef, int positionInCollection,
String parentCollectionRef, String iterationId, String wfInstanceRef, String itVector, String currentWorkflowID) {
List<VarBinding> newBindings = new ArrayList<VarBinding>();
String valueType = valueEl.getName();
//"value element for " + processorId + ": "
// + valueType);
// an explicit itVector (set by the recursive calls below) wins over the one derived from iterationId
String iterationVector = null;
if (itVector == null)
iterationVector = extractIterationVector(iterationId);
else iterationVector = itVector;
VarBinding vb = new VarBinding();
if (valueType.equals("literal")) {
// logger.warn("input of type literal");
try {
logger.debug("new input VB with wfNameRef="+currentWorkflowID+" processorId="+processorId+
" valueType="+valueType+" portName="+portName+" collIdRef="+collIdRef+
" position="+positionInCollection+" itvector="+iterationVector+
" value="+vb.getValue());
//"calling addVarBinding on "+vb.getPNameRef()+" : "+vb.getVarNameRef());
} catch (SQLException e) {
logger.warn("Process Var Binding problem with provenance", e);
} else if (valueType.equals("referenceSet")) {
logger.debug("new input VB with wfNameRef="+currentWorkflowID+" processorId="+processorId+
" valueType="+valueType+" portName="+portName+" collIdRef="+collIdRef+
" position="+positionInCollection+" itvector="+iterationVector+
" value="+vb.getValue());
try {
// logger.debug("calling addVarBinding on "+vb.getPNameRef()+" : "+vb.getVarNameRef()+" with it "+vb.getIteration());
} catch (SQLException e) {
// a failed insert is only logged at debug level because an update is attempted instead
logger.debug("Problem processing var binding -- performing update instead of insert", e); //, e);
// try to update the existing record instead using the current collection info
// logger.warn("VarBinding update successful");
} else if (valueType.equals("list")) {
logger.debug("input of type list");
// add entries to the Collection and to the VarBinding tables
// list id --> Collection.collId
String collId = valueEl.getAttributeValue("id");
try {
// the value returned by addCollection becomes the parent reference handed to the children
// NOTE(review): the remaining arguments of this call are not visible here
parentCollectionRef = getPw().addCollection(processorId, collId,
parentCollectionRef, iterationVector, portName,
// iterate over each list element
List<Element> listElements = valueEl.getChildren();
positionInCollection = 1; // also use this as a suffix to extend the iteration vector
// extend iteration vector to account for additional levels within the list
// NOTE(review): iterationVector is null here when itVector is null and
// extractIterationVector(iterationId) returns null; length() below would then throw -- confirm
String originalIterationVector = iterationVector;
// children can be any base type, including list itself -- so
// use recursion
for (Element el : listElements) {
// "[a,b]" at position p becomes "[a,b,p-1]"; a vector of length <= 2 (e.g. "[]") becomes "[p-1]"
if (originalIterationVector.length() >2) { // vector is not empty
iterationVector = originalIterationVector.substring(0,
originalIterationVector.length()-1) + ","+
Integer.toString(positionInCollection-1) + "]";
} else {
iterationVector = "["+ Integer.toString(positionInCollection-1) + "]";
// the child is bound to this list (collId) and receives the extended vector as itVector
// NOTE(review): the increment of positionInCollection and the use of 'bindings' are not visible here
List<VarBinding> bindings = processVarBinding(el, processorId, portName, collId,
positionInCollection, parentCollectionRef,
iterationId, wfInstanceRef, iterationVector, currentWorkflowID);
} catch (SQLException e) {
logger.warn("Problem processing var binding", e);
} else if (valueType.equals("error")) {
try {
// NOTE(review): the handling of "error" values is not visible here
} catch (SQLException e) {
logger.warn("Process Var Binding problem with provenance", e);
} else {
// any other element name is reported and otherwise ignored
logger.warn("unrecognized value type element for "
+ processorId + ": " + valueType);
return newBindings;
* OBSOLETE: returns the iteration vector x,y,z,... from [x,y,z,...]
* <p/>
* now returns the vector itself -- this is still experimental
* @param iteration
* @return
String extractIterationVector(String iteration) {
return iteration;
// return iteration.substring(1, iteration.length() - 1);
// iteration is of the form "[n]" so we extract n
// String iterationN = iteration.substring(1, iteration.length()-1);
// if (iterationN.length() == 0) return 0;
// return Integer.parseInt(iterationN);
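/**
 * Log the raw event to the file system.
 *
 * @param provenanceItem the event to be saved
 * @param eventType type of the event; it becomes part of the generated file name
 * @throws IOException
 */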
// Saves one provenance event as an XML file named "event_<counter>_<eventType>.xml" inside
// TEST_EVENTS_FOLDER, using java.beans.XMLEncoder.
// NOTE(review): the body of the iteration-event guard, the braces of the anonymous listener
// and the statements that write to / close the encoder are not visible in this view --
// confirm against the complete file (in particular that 'en' is closed).
public void saveEvent(ProvenanceItem provenanceItem, SharedVocabulary eventType) throws IOException {
// HACK -- XMLEncoder fails on IterationEvents and there is no way to catch the exception...
// so avoid this case
if (eventType.equals(SharedVocabulary.ITERATION_EVENT_TYPE)) {
// NOTE(review): presumably returns here without saving -- the statement is not visible
// System.out.println("saveEvent: start");
File f1 = null;
f1 = new File(TEST_EVENTS_FOLDER);
// eventCnt is post-incremented, so every saved event gets its own file name
String fname = "event_" + eventCnt++ + "_" + eventType + ".xml";
File f = new File(f1, fname);
// FileWriter fw = new FileWriter(f);
XMLEncoder en = new XMLEncoder(new BufferedOutputStream(
new FileOutputStream(f)));
// encoding problems are reported to this listener and only logged
en.setExceptionListener(new ExceptionListener()
public void exceptionThrown(Exception e)
logger.warn("XML encoding ERROR", e);
logger.debug("saving to " + f); // save event for later inspection
logger.debug("writer ok");
// fw.write(content);
// fw.flush();
// fw.close();
// FileWriter fw = new FileWriter(f);
// fw.write(content);
// fw.flush();
// fw.close();
// System.out.println("saved as file " + fname);
/**
 * For each arc of the form (_INPUT_/I, P/V): propagate the VarBinding for P/V
 * to var _INPUT_/I. <br/>
 *
 * @param context not read in the visible part of the method
 * @throws SQLException
 */
// Back-propagates VarBindings along every arc whose source processor is "_INPUT_": the
// bindings recorded for the arc's sink (processor/port) are fetched for the current
// workflow instance and iterated over.
// NOTE(review): what happens to each fetched binding (presumably it is re-pointed at the
// _INPUT_ var and written back) is not visible in this view, nor are the closing braces.
// NOTE(review): 'context' is not read in the visible code.
public void fillInputVarBindings(Object context) throws SQLException {
// System.out.println("*** fillInputVarBindings: ***");
// retrieve appropriate arcs
// NOTE(review): the instance is constrained through key "W.instanceID" here, whereas
// fillOutputVarBindings uses "wfInstanceRef" -- confirm that both keys are intended
Map<String, String> constraints = new HashMap<String, String>();
constraints.put("sourcePnameRef", "_INPUT_");
constraints.put("W.instanceID", getWfInstanceID());
List<Arc> arcs = getPq().getArcs(constraints);
// backpropagate VarBinding from the target var of the arc to the source
for (Arc aArc : arcs) {
//"propagating VarBinding from ["
// + aArc.getSinkPnameRef() + "/" + aArc.getSinkVarNameRef()
// + "] to input [" + aArc.getSourcePnameRef() + "/"
// + aArc.getSourceVarNameRef() + "]");
// get the varBinding for the arc sinks
Map<String, String> vbConstraints = new HashMap<String, String>();
vbConstraints.put("VB.PNameRef", aArc.getSinkPnameRef());
vbConstraints.put("VB.varNameRef", aArc.getSinkVarNameRef());
vbConstraints.put("VB.wfInstanceRef", getWfInstanceID());
List<VarBinding> vbList = getPq().getVarBindings(vbConstraints); // DB
for (VarBinding vb : vbList) {
// add a new VarBinding for the input
// all other attributes are the same --> CHECK!!
/**
 * For each arc of the form (P/V, _OUTPUT_/O): propagate the VarBinding for P/V
 * to var _OUTPUT_/O. <br/>
 *
 * @param context not read in the visible part of the method
 * @throws SQLException
 */
// Forward-propagates VarBindings along every arc whose sink processor is "_OUTPUT_": the
// bindings recorded for the arc's source (processor/port) are fetched for the current
// workflow instance and each one is passed to the ProvenanceWriter.
// NOTE(review): in the visible code 'vb' is written back unchanged; presumably it is first
// re-pointed at the arc's sink (_OUTPUT_) var -- those statements and the closing braces
// are not visible in this view.
// NOTE(review): 'context' is not read in the visible code.
public void fillOutputVarBindings(Object context) throws SQLException {
//System.out.println("*** fillOutputVarBindings: ***");
// retrieve appropriate arcs
Map<String, String> constraints = new HashMap<String, String>();
constraints.put("sinkPnameRef", "_OUTPUT_");
constraints.put("wfInstanceRef", getWfInstanceID());
List<Arc> arcs = getPq().getArcs(constraints);
// forward-propagate VarBinding from the source var of the arc to the
// output
for (Arc aArc : arcs) {
//"fwd propagating VarBinding from ["
// + aArc.getSourcePnameRef() + "/"
// + aArc.getSourceVarNameRef() + "] to input ["
// + aArc.getSinkPnameRef() + "/" + aArc.getSinkVarNameRef()
// + "]");
// get the varBindings recorded for the arc source
Map<String, String> vbConstraints = new HashMap<String, String>();
vbConstraints.put("VB.PNameRef", aArc.getSourcePnameRef());
vbConstraints.put("VB.varNameRef", aArc.getSourceVarNameRef());
vbConstraints.put("VB.wfInstanceRef", getWfInstanceID());
List<VarBinding> vbList = getPq().getVarBindings(vbConstraints); // DB
for (VarBinding vb : vbList) {
// add a new VarBinding for the output
getPw().addVarBinding(vb); // DB UPDATE
/**
 * Simple mutable holder for a pair of strings.
 * <p>
 * The constructor's parameter names indicate the intended use: the first element holds the
 * current processor name, the second the name of the workflow it is referenced from.
 *
 * @author paolo
 */
class Pair {

    String v1, v2;

    /**
     * @param current value stored as the first element (v1)
     * @param wfNameRef value stored as the second element (v2)
     */
    public Pair(String current, String wfNameRef) {
        v1 = current;
        v2 = wfNameRef;
    }

    /**
     * @return the v1
     */
    public String getV1() {
        return v1;
    }

    /**
     * @param v1 the v1 to set
     */
    public void setV1(String v1) {
        this.v1 = v1;
    }

    /**
     * @return the v2
     */
    public String getV2() {
        return v2;
    }

    /**
     * @param v2 the v2 to set
     */
    public void setV2(String v2) {
        this.v2 = v2;
    }
}
public List<Pair> toposort(String dataflowName, String wfInstanceId) throws SQLException {
// String wfNameRef = pq.getWfNameForDataflow(dataflowName, wfInstanceID);
String wfNameRef = pq.getWfNameForDataflow(dataflowName);
// fetch processors along with the count of their predecessors
Map<String, Integer> predecessorsCount = getPq().getPredecessorsCount(wfInstanceId);
Map<String, List<String>> successorsOf = new HashMap<String, List<String>>();
// List<String> procList = pq.getContainedProcessors(dataflowName, wfInstanceId);
List<String> procList = pq.getContainedProcessors(dataflowName);
// logger.debug("toposort on "+dataflowName);
// logger.debug("contained procs: ");
for (String s:procList) {
List<String> successors = getPq().getSuccProcessors(s, wfNameRef, wfInstanceId);
successorsOf.put(s, successors);
// logger.debug(s+" with "+predecessorsCount.get(s)+" predecessors and successors:");
// for (String s1:successors) { logger.debug(s1); }
List<Pair> sorted = tsort(procList, dataflowName, predecessorsCount, successorsOf, wfNameRef, wfInstanceId);
// logger.debug("tsort:");
// for (String p : sorted) { logger.debug(p); }
for (int i=0; i< sorted.size(); i++) {
String procName = sorted.get(i).getV1();
if (pq.isDataflow(procName) && !procName.equals(dataflowName)) { // handle weirdness: a dataflow is contained within itself..
// recurse on procName
// logger.debug("recursion on "+procName);
List<Pair> sortedSublist = toposort(procName, wfInstanceId);
// replace procName with sortedSublist in sorted
sorted.addAll(i, sortedSublist);
return sorted;
* @param procList
* @param predecessorsCount
* @param successorsOf
* @param wfInstanceId
* @return
* @throws SQLException
public List<Pair> tsort(List<String> procList,
String dataflowName,
Map<String, Integer> predecessorsCount,
Map<String, List<String>> successorsOf,
String wfNameRef, String wfInstanceId) throws SQLException {
List<Pair> L = new ArrayList<Pair>(); // holds sorted elements
List<String> Q = new ArrayList<String>(); // temp queue
Set<String> visited = new HashSet<String>();
// logger.debug("queue init with procList");
// init queue with procList processors that have no predecessors
for (String proc:procList) {
// logger.debug("dataflowName: "+dataflowName+" proc: "+proc);
if (predecessorsCount.get(proc) == null || predecessorsCount.get(proc) == 0 &&
!proc.equals(dataflowName)) {
// logger.debug(proc + " added to queue");
// } else
// logger.debug(proc+" not added to queue");
// logger.debug("queue has "+Q.size()+" elements");
while (!Q.isEmpty()) {
String current = Q.remove(0);
// logger.debug("extracted "+current+" and added to L");
L.add(new Pair(current,wfNameRef));
// for (String s:L) logger.debug(s);
List<String> successors = successorsOf.get(current);
// logger.debug("\n****successors of "+current);
if (successors == null) continue;
// reduce the number of predecessors to each of the successors by one
// NB we must traverse an additional arc through a nested workflow input if the successor is a dataflow!!
for (String succ : successors) {
// logger.debug(succ);
// decrease edge count for each successor processor
Integer cnt = predecessorsCount.get(succ);
predecessorsCount.put(succ, new Integer(cnt.intValue() - 1));
// logger.debug("now "+succ+" has "+predecessorsCount.get(succ)+" predecessors");
if (predecessorsCount.get(succ) == 0 && !succ.equals(dataflowName)) {
// logger.debug("adding "+succ+" to queue");
} // end loop on Q
return L;
// Propagates actual nesting levels (ANL) through the workflow graph of the given instance:
// phase I sorts all processors topologically (nested dataflows included), phase II visits
// them in that order, sums the nesting-level surplus of each processor's inputs and derives
// the ANL of its outputs and of the vars linked to them.
// NOTE(review): the statements that update 'iv', fill 'toBeProcessed' and update 'v'/'v1'
// (presumably followed by a write through the ProvenanceWriter) are not visible in this
// view, nor are the closing braces -- confirm against the complete file.
public void propagateANL(String wfInstanceId) throws SQLException {
String top = pq.getTopLevelDataflowName(wfInstanceId);
// //////////////////////
// PHASE I: toposort the processors in the whole graph
// //////////////////////
List<Pair> sorted = toposort(top, wfInstanceId);
logger.debug("final sorted list of processors");
for (Pair p:sorted) { logger.debug(p.getV1()+" in wfnameRef "+p.getV2()); }
// //////////////////////
// PHASE II: traverse and set anl on each port
// //////////////////////
// logger.debug("***** STARTING ANL *****");
// // sorted processor names in L at this point
// // process them in order
for (Pair pnameInContext : sorted) {
// logger.debug("setting ANL for "+pnameInContext.getV1()+" input vars");
// // process pname's inputs -- set ANL to be the DNL if not set in prior steps
// v1 = processor name, v2 = name of the workflow it was sorted in
String pname = pnameInContext.getV1();
String wfNameRef = pnameInContext.getV2();
// logger.debug("processor "+pname);
List<Var> inputs = getPq().getInputVars(pname, wfNameRef, wfInstanceId); // null -> do not use instance (??) CHECK
// logger.debug(inputs.size()+" inputs for "+pnameInContext.getV1());
// sum over all inputs of max(0, actual nesting level - type nesting level)
int totalANL = 0;
for (Var iv : inputs) {
if (iv.isANLset() == false) {
// NOTE(review): the handling of an input whose ANL is not yet set is not visible here
// logger.debug("var: "+iv.getVName()+" set at nominal level "+iv.getActualNestingLevel());
int delta_nl = iv.getActualNestingLevel() - iv.getTypeNestingLevel();
// if delta_nl < 0 then Taverna wraps the value into a list --> use dnl(X) in this case
if (delta_nl < 0 ) delta_nl = 0;// CHECK iv.getTypeNestingLevel();
// logger.debug("delta for "+iv.getVName()+" "+delta_nl);
totalANL += delta_nl;
// this should take care of the special case of the top level dataflow with inputs that have successors in the graph
// propagate this through all the links from this var
// List<Var> successors = getPq().getSuccVars(pname, iv.getVName(), wfInstanceId);
// logger.debug(successors.size()+ " successors for var "+iv.getVName());
// for (Var v : successors) {
// v.setActualNestingLevel(iv.getActualNestingLevel());
// v.setANLset(true);
// getPw().updateVar(v);
// }
// logger.debug("total anl: "+totalANL);
// logger.debug("now setting ANL for "+pname+" output vars");
// process pname's outputs -- set ANL based on the sum formula (see
// paper)
List<Var> outputs = getPq().getOutputVars(pname, wfNameRef, wfInstanceId);
for (Var ov : outputs) {
// output ANL = declared (type) nesting level + surplus accumulated over the inputs
ov.setActualNestingLevel(ov.getTypeNestingLevel() + totalANL);
logger.debug("anl for "+pname+":"+ov.getVName()+" = "+(ov.getTypeNestingLevel() + totalANL));
// propagate this through all the links from this var
List<Var> successors = getPq().getSuccVars(pname, ov.getVName(), wfNameRef);
// logger.debug(successors.size()+ " successors for var "+ov.getVName());
for (Var v : successors) {
// vars that should receive ov's ANL; a successor on a nested dataflow is looked through
// to its "real" successors
List<Var> toBeProcessed = new ArrayList<Var>();
if (pq.isDataflow(v.getPName()) && v.isInput()) { // this is the input to a nested workflow
// String tempWfNameRef = pq.getWfNameForDataflow(v.getPName(), wfInstanceId);
// the real successors are searched inside the nested dataflow's own workflow
String tempWfNameRef = pq.getWfNameForDataflow(v.getPName());
List<Var> realSuccessors = getPq().getSuccVars(v.getPName(), v.getVName(), tempWfNameRef);
// logger.debug("realSuccessors size = "+realSuccessors.size());
} else if (pq.isDataflow(v.getPName()) && !v.isInput()) { // this is the output to a nested workflow
// String tempWfNameRef = pq.getWfNameForDataflow(v.getPName(), wfInstanceId);
// NOTE(review): tempWfNameRef is computed but the query below passes null instead -- confirm
String tempWfNameRef = pq.getWfNameForDataflow(v.getPName());
List<Var> realSuccessors = getPq().getSuccVars(v.getPName(), v.getVName(), null);
// logger.debug("realSuccessors size = "+realSuccessors.size());
for (Var v1:toBeProcessed) {
// the value logged for every collected var is the ANL of the output var 'ov'
logger.debug("anl for "+v1.getPName()+":"+v1.getVName()+" = "+ov.getActualNestingLevel());
public void setPw(ProvenanceWriter pw) { = pw;
public ProvenanceWriter getPw() {
return pw;
public void setPq(ProvenanceQuery pq) {
this.pq = pq;
public ProvenanceQuery getPq() {
return pq;
public void setWfInstanceID(String wfInstanceID) {
this.wfInstanceID = wfInstanceID;
public String getWfInstanceID() {
return wfInstanceID;
public void setWfdp(WorkflowDataProcessor wfdp) {
this.wfdp = wfdp;
public WorkflowDataProcessor getWfdp() {
return wfdp;