blob: bcb8c36a961c1e18d6186cf03473fdf02e40fd14 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.nutch.parse.swf;
import java.lang.invoke.MethodHandles;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Stack;
import java.util.Vector;
import java.io.FileInputStream;
import java.io.IOException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.nutch.metadata.Metadata;
import org.apache.nutch.net.protocols.Response;
import org.apache.nutch.parse.Outlink;
import org.apache.nutch.parse.OutlinkExtractor;
import org.apache.nutch.parse.Parse;
import org.apache.nutch.parse.ParseData;
import org.apache.nutch.parse.ParseImpl;
import org.apache.nutch.parse.ParseResult;
import org.apache.nutch.parse.ParseStatus;
import org.apache.nutch.parse.Parser;
import org.apache.nutch.protocol.Content;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.hadoop.conf.Configuration;
import com.anotherbigidea.flash.interfaces.SWFActionBlock;
import com.anotherbigidea.flash.interfaces.SWFActions;
import com.anotherbigidea.flash.interfaces.SWFText;
import com.anotherbigidea.flash.interfaces.SWFVectors;
import com.anotherbigidea.flash.readers.SWFReader;
import com.anotherbigidea.flash.readers.TagParser;
import com.anotherbigidea.flash.structs.AlphaColor;
import com.anotherbigidea.flash.structs.Color;
import com.anotherbigidea.flash.structs.Matrix;
import com.anotherbigidea.flash.structs.Rect;
import com.anotherbigidea.flash.writers.SWFActionBlockImpl;
import com.anotherbigidea.flash.writers.SWFTagTypesImpl;
import com.anotherbigidea.io.InStream;
/**
* Parser for Flash SWF files. Loosely based on the sample in JavaSWF
* distribution.
*/
public class SWFParser implements Parser {
private static final Logger LOG = LoggerFactory
.getLogger(MethodHandles.lookup().lookupClass());
private Configuration conf = null;
public SWFParser() {
//default constructor
}
@Override
public void setConf(Configuration conf) {
this.conf = conf;
}
@Override
public Configuration getConf() {
return conf;
}
@Override
public ParseResult getParse(Content content) {
String text = null;
Vector<Outlink> outlinks = new Vector<>();
try {
byte[] raw = content.getContent();
String contentLength = content.getMetadata().get(Response.CONTENT_LENGTH);
if (contentLength != null
&& raw.length != Integer.parseInt(contentLength)) {
return new ParseStatus(ParseStatus.FAILED,
ParseStatus.FAILED_TRUNCATED, "Content truncated at " + raw.length
+ " bytes. Parser can't handle incomplete files.")
.getEmptyParseResult(content.getUrl(), getConf());
}
ExtractText extractor = new ExtractText();
// TagParser implements SWFTags and drives a SWFTagTypes interface
TagParser parser = new TagParser(extractor);
// use this instead to debug the file
// TagParser parser = new TagParser( new SWFTagDumper(true, true) );
// SWFReader reads an input file and drives a SWFTags interface
SWFReader reader = new SWFReader(parser, new InStream(raw));
// read the input SWF file and pass it through the interface pipeline
reader.readFile();
text = extractor.getText();
String atext = extractor.getActionText();
if (atext != null && atext.length() > 0)
text += "\n--------\n" + atext;
// harvest potential outlinks
String[] links = extractor.getUrls();
for (int i = 0; i < links.length; i++) {
Outlink out = new Outlink(links[i], "");
outlinks.add(out);
}
Outlink[] olinks = OutlinkExtractor.getOutlinks(text, conf);
if (olinks != null)
for (int i = 0; i < olinks.length; i++) {
outlinks.add(olinks[i]);
}
} catch (Exception e) { // run time exception
LOG.error("Error, runtime exception: ", e);
return new ParseStatus(ParseStatus.FAILED,
"Can't be handled as SWF document. " + e).getEmptyParseResult(
content.getUrl(), getConf());
}
if (text == null)
text = "";
Outlink[] links = (Outlink[]) outlinks
.toArray(new Outlink[outlinks.size()]);
ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, "", links,
content.getMetadata());
return ParseResult.createParseResult(content.getUrl(), new ParseImpl(text,
parseData));
}
/**
* Arguments are: 0. Name of input SWF file.
*/
public static void main(String[] args) throws IOException {
FileInputStream in = new FileInputStream(args[0]);
byte[] buf = new byte[in.available()];
in.read(buf);
in.close();
SWFParser parser = new SWFParser();
ParseResult parseResult = parser.getParse(new Content("file:" + args[0],
"file:" + args[0], buf, "application/x-shockwave-flash",
new Metadata(), NutchConfiguration.create()));
Parse p = parseResult.get("file:" + args[0]);
System.out.println("Parse Text:");
System.out.println(p.getText());
System.out.println("Parse Data:");
System.out.println(p.getData());
}
}
/**
* Shows how to parse a Flash movie and extract all the text in Text symbols and
* the initial text in Edit Fields. Output is to System.out.
*
* A "pipeline" is set up in the main method:
*
* SWFReader-->TagParser-->ExtractText
*
* SWFReader reads the input SWF file and separates out the header and the tags.
* The separated contents are passed to TagParser which parses out the
* individual tag types and passes them to ExtractText.
*
* ExtractText extends SWFTagTypesImpl and overrides some methods.
*/
class ExtractText extends SWFTagTypesImpl {
/**
* Store font info keyed by the font symbol id. Each entry is an int[] of
* character codes for the correspnding font glyphs (An empty array denotes a
* System Font).
*/
protected HashMap<Integer, int[]> fontCodes = new HashMap<>();
public ArrayList<String> strings = new ArrayList<>();
public HashSet<String> actionStrings = new HashSet<>();
public ArrayList<String> urls = new ArrayList<>();
public ExtractText() {
super(null);
}
public String getText() {
StringBuffer res = new StringBuffer();
Iterator<String> it = strings.iterator();
while (it.hasNext()) {
if (res.length() > 0)
res.append(' ');
res.append(it.next());
}
return res.toString();
}
public String getActionText() {
StringBuffer res = new StringBuffer();
String[] strings = (String[]) actionStrings
.toArray(new String[actionStrings.size()]);
Arrays.sort(strings);
for (int i = 0; i < strings.length; i++) {
if (i > 0)
res.append('\n');
res.append(strings[i]);
}
return res.toString();
}
public String[] getUrls() {
String[] res = new String[urls.size()];
int i = 0;
Iterator<String> it = urls.iterator();
while (it.hasNext()) {
res[i] = it.next();
i++;
}
return res;
}
public void tagDefineFontInfo2(int arg0, String arg1, int arg2, int[] arg3,
int arg4) throws IOException {
tagDefineFontInfo(arg0, arg1, arg2, arg3);
}
/**
* SWFTagTypes interface Save the Text Font character code info
*/
public void tagDefineFontInfo(int fontId, String fontName, int flags,
int[] codes) throws IOException {
// System.out.println("-defineFontInfo id=" + fontId + ", name=" +
// fontName);
fontCodes.put(Integer.valueOf(fontId), codes);
}
// XXX too much hassle for too little return ... we cannot guess character
// XXX codes anyway, so we just give up.
/*
* public SWFVectors tagDefineFont(int arg0, int arg1) throws IOException {
* return null; }
*/
/**
* SWFTagTypes interface. Save the character code info.
*/
public SWFVectors tagDefineFont2(int id, int flags, String name,
int numGlyphs, int ascent, int descent, int leading, int[] codes,
int[] advances, Rect[] bounds, int[] kernCodes1, int[] kernCodes2,
int[] kernAdjustments) throws IOException {
fontCodes.put(Integer.valueOf(id), (codes != null) ? codes : new int[0]);
return null;
}
/**
* SWFTagTypes interface. Dump any initial text in the field.
*/
public void tagDefineTextField(int fieldId, String fieldName,
String initialText, Rect boundary, int flags, AlphaColor textColor,
int alignment, int fontId, int fontSize, int charLimit, int leftMargin,
int rightMargin, int indentation, int lineSpacing) throws IOException {
if (initialText != null) {
strings.add(initialText);
}
}
/**
* SWFTagTypes interface
*/
public SWFText tagDefineText(int id, Rect bounds, Matrix matrix)
throws IOException {
lastBounds = curBounds;
curBounds = bounds;
return new TextDumper();
}
Rect lastBounds = null;
Rect curBounds = null;
/**
* SWFTagTypes interface
*/
public SWFText tagDefineText2(int id, Rect bounds, Matrix matrix)
throws IOException {
lastBounds = curBounds;
curBounds = bounds;
return new TextDumper();
}
public class TextDumper implements SWFText {
protected Integer fontId;
protected boolean firstY = true;
@Override
public void font(int fontId, int textHeight) {
this.fontId = fontId;
}
@Override
public void setY(int y) {
if (firstY)
firstY = false;
else
strings.add("\n"); // Change in Y - dump a new line
}
/*
* There are some issues with this method: sometimes SWF files define their
* own font, so short of OCR we cannot guess what is the glyph code ->
* character mapping. Additionally, some files don't use literal space
* character, instead they adjust glyphAdvances. We don't handle it at all -
* in such cases the text will be all glued together.
*/
@Override
public void text(int[] glyphIndices, int[] glyphAdvances) {
int[] codes = (int[]) fontCodes.get(fontId);
if (codes == null) {
// unknown font, better not guess
strings.add("\n**** ?????????????? ****\n");
return;
}
// --Translate the glyph indices to character codes
char[] chars = new char[glyphIndices.length];
for (int i = 0; i < chars.length; i++) {
int index = glyphIndices[i];
if (index >= codes.length) // System Font ?
{
chars[i] = (char) index;
} else {
chars[i] = (char) (codes[index]);
}
}
strings.add(new String(chars));
}
@Override
public void color(Color color) {
}
@Override
public void setX(int x) {
}
@Override
public void done() {
strings.add("\n");
}
}
@Override
public SWFActions tagDoAction() throws IOException {
return new NutchSWFActions(actionStrings, urls);
}
@Override
public SWFActions tagDoInitAction(int arg0) throws IOException {
return new NutchSWFActions(actionStrings, urls);
}
}
/**
* ActionScript parser. This parser tries to extract free text embedded inside
* the script, but without polluting it too much with names of variables,
* methods, etc. Not ideal, but it works.
*/
class NutchSWFActions extends SWFActionBlockImpl implements SWFActions {
private HashSet<String> strings = null;
private ArrayList<String> urls = null;
String[] dict = null;
Stack<Object> stack = null;
public NutchSWFActions(HashSet<String> strings, ArrayList<String> urls) {
this.strings = strings;
this.urls = urls;
stack = new SmallStack(100, strings);
}
@Override
public void lookupTable(String[] values) throws IOException {
for (int i = 0; i < values.length; i++) {
if (!strings.contains(values[i]))
strings.add(values[i]);
}
super.lookupTable(values);
dict = values;
}
@Override
public void defineLocal() throws IOException {
stack.pop();
super.defineLocal();
}
public void getURL(int vars, int mode) {
}
@Override
public void getURL(String url, String target) throws IOException {
stack.push(url);
stack.push(target);
strings.remove(url);
strings.remove(target);
urls.add(url);
super.getURL(url, target);
}
public SWFActionBlock.TryCatchFinally _try(String var) throws IOException {
strings.remove(var);
return super._try(var);
}
@Override
public void comment(String var) throws IOException {
strings.remove(var);
super.comment(var);
}
public void goToFrame(String var) throws IOException {
stack.push(var);
strings.remove(var);
super.gotoFrame(var);
}
public void ifJump(String var) throws IOException {
strings.remove(var);
super.ifJump(var);
}
public void jump(String var) throws IOException {
strings.remove(var);
super.jump(var);
}
public void jumpLabel(String var) throws IOException {
strings.remove(var);
super.jumpLabel(var);
}
public void lookup(int var) throws IOException {
if (dict != null && var >= 0 && var < dict.length) {
stack.push(dict[var]);
}
super.lookup(var);
}
public void push(String var) throws IOException {
stack.push(var);
strings.remove(var);
super.push(var);
}
public void setTarget(String var) throws IOException {
stack.push(var);
strings.remove(var);
super.setTarget(var);
}
public SWFActionBlock startFunction(String var, String[] params)
throws IOException {
stack.push(var);
strings.remove(var);
if (params != null) {
for (int i = 0; i < params.length; i++) {
strings.remove(params[i]);
}
}
return this;
}
public SWFActionBlock startFunction2(String var, int arg1, int arg2,
String[] params, int[] arg3) throws IOException {
stack.push(var);
strings.remove(var);
if (params != null) {
for (int i = 0; i < params.length; i++) {
strings.remove(params[i]);
}
}
return this;
}
public void waitForFrame(int num, String var) throws IOException {
stack.push(var);
strings.remove(var);
super.waitForFrame(num, var);
}
public void waitForFrame(String var) throws IOException {
stack.push(var);
strings.remove(var);
super.waitForFrame(var);
}
public void done() throws IOException {
while (stack.size() > 0) {
strings.remove(stack.pop());
}
}
public SWFActionBlock start(int arg0, int arg1) throws IOException {
return this;
}
public SWFActionBlock start(int arg0) throws IOException {
return this;
}
public void add() throws IOException {
super.add();
}
public void asciiToChar() throws IOException {
super.asciiToChar();
}
public void asciiToCharMB() throws IOException {
super.asciiToCharMB();
}
public void push(int var) throws IOException {
if (dict != null && var >= 0 && var < dict.length) {
stack.push(dict[var]);
}
super.push(var);
}
public void callFunction() throws IOException {
strings.remove(stack.pop());
super.callFunction();
}
public void callMethod() throws IOException {
strings.remove(stack.pop());
super.callMethod();
}
public void getMember() throws IOException {
// 0: name
String val = (String) stack.pop();
strings.remove(val);
super.getMember();
}
public void setMember() throws IOException {
// 0: value -1: name
stack.pop(); // value
String name = (String) stack.pop();
strings.remove(name);
super.setMember();
}
public void setProperty() throws IOException {
super.setProperty();
}
public void setVariable() throws IOException {
super.setVariable();
}
public void call() throws IOException {
strings.remove(stack.pop());
super.call();
}
public void setTarget() throws IOException {
strings.remove(stack.pop());
super.setTarget();
}
public void pop() throws IOException {
strings.remove(stack.pop());
super.pop();
}
public void push(boolean arg0) throws IOException {
stack.push("" + arg0);
super.push(arg0);
}
public void push(double arg0) throws IOException {
stack.push("" + arg0);
super.push(arg0);
}
public void push(float arg0) throws IOException {
stack.push("" + arg0);
super.push(arg0);
}
public void pushNull() throws IOException {
stack.push("");
super.pushNull();
}
public void pushRegister(int arg0) throws IOException {
stack.push("" + arg0);
super.pushRegister(arg0);
}
public void pushUndefined() throws IOException {
stack.push("???");
super.pushUndefined();
}
public void getProperty() throws IOException {
stack.pop();
super.getProperty();
}
public void getVariable() throws IOException {
strings.remove(stack.pop());
super.getVariable();
}
public void gotoFrame(boolean arg0) throws IOException {
stack.push("" + arg0);
super.gotoFrame(arg0);
}
public void gotoFrame(int arg0) throws IOException {
stack.push("" + arg0);
super.gotoFrame(arg0);
}
public void gotoFrame(String arg0) throws IOException {
stack.push("" + arg0);
strings.remove(arg0);
super.gotoFrame(arg0);
}
public void newObject() throws IOException {
stack.pop();
super.newObject();
}
public SWFActionBlock startWith() throws IOException {
return this;
}
}
/*
* Small bottom-less stack.
*/
class SmallStack extends Stack<Object> {
private static final long serialVersionUID = 1L;
private int maxSize;
private HashSet<String> strings = null;
public SmallStack(int maxSize, HashSet<String> strings) {
this.maxSize = maxSize;
this.strings = strings;
}
public Object push(Object o) {
// limit max size
if (this.size() > maxSize) {
String val = (String) remove(0);
strings.remove(val);
}
return super.push(o);
}
public Object pop() {
// tolerate underruns
if (this.size() == 0)
return null;
else
return super.pop();
}
}