// solr/example/files/conf/update-script.js - lucene-solr - Git at Google

 /**
  * Resolves the Java class object for a fully qualified class name.
  *
  * Two engine-specific lookup expressions are tried in order; both go through
  * eval so that they are only evaluated at run time.
  *
  * @param name fully qualified Java class name, e.g. "java.lang.String"
  * @return the value produced by whichever lookup expression succeeded
  */
 function get_class(name) {
   try {
     // Java8 Nashorn
     return eval("Java.type(name).class");
   } catch (lookup_error) {
     // Java7 Rhino -- used whenever the expression above throws
     return eval("Packages."+name);
   }
 }

 /**
  * Enriches a document that is being added with derived fields.
  *
  *  1. Logs the document id.
  *  2. If "content_type" is present: drops everything from the first ';'
  *     onward, sets "doc_type" when the type matches one of the known
  *     families, and sets "content_type_type_s" / "content_type_subtype_s"
  *     from the parts before / after the first '/'. A value without a '/'
  *     is stored whole in "content_type_type_s" and no subtype is written.
  *  3. If "content" is present: runs it through the index analyzer of the
  *     "text_email_url" field type and adds every token's text to a field
  *     named after the token type (angle brackets removed, lower-cased,
  *     suffixed with "_ss"; e.g. a type of "<URL>" yields "url_ss").
  *
  * Uses the globals `logger` and `req`, and get_class() from this script.
  *
  * @param cmd add command; cmd.solrDoc is the document to enrich
  */
 function processAdd(cmd) {

   // Declared with var so the document is not leaked as an implicit global.
   var doc = cmd.solrDoc;  // org.apache.solr.common.SolrInputDocument
   var id = doc.getFieldValue("id");
   logger.info("update-script#processAdd: id=" + id);

   // The idea here is to use the file's content_type value to
   // simplify into user-friendly values, such that types of, say, image/jpeg and image/tiff
   // are in an "Images" facet

   var ct = doc.getFieldValue("content_type");
   if (ct) {
     // strip off semicolon onward
     var semicolon_index = ct.indexOf(';');
     if (semicolon_index >= 0) {
       ct = ct.substring(0, semicolon_index);
     }

     // Ordered [pattern, doc_type] rules; the first matching pattern wins.
     var doc_type_rules = [
       [/^application\/rtf/, "doc"],
       [/wordprocessing/, "doc"],
       [/html/, "html"],
       [/^image\//, "image"],
       [/presentation|powerpoint/, "presentation"],
       [/spreadsheet|excel/, "spreadsheet"],
       [/^application\/pdf/, "pdf"],
       [/^text\/plain/, "text"]
     ];

     var doc_type;
     for (var i = 0; i < doc_type_rules.length; i++) {
       if (doc_type_rules[i][0].test(ct)) {
         doc_type = doc_type_rules[i][1];
         break;
       }
     }
     if (doc_type) { doc.setField("doc_type", doc_type); }

     // and split type/subtype
     var slash_index = ct.indexOf('/');
     if (slash_index >= 0) {
       doc.setField("content_type_type_s", ct.substring(0, slash_index));
       doc.setField("content_type_subtype_s", ct.substring(slash_index + 1));
     } else {
       // No slash: there is no subtype to record, so keep the whole value as
       // the type instead of writing an empty type and a bogus subtype.
       doc.setField("content_type_type_s", ct);
     }
   }

   var content = doc.getFieldValue("content");
   if (!content) {
     return; //No content found, so we are done here
   }

   var analyzer =
        req.getCore().getLatestSchema()
        .getFieldTypeByName("text_email_url")
        .getIndexAnalyzer();

   var token_stream = analyzer.tokenStream("content", content);
   try {
     var term_att = token_stream.getAttribute(get_class("org.apache.lucene.analysis.tokenattributes.CharTermAttribute"));
     var type_att = token_stream.getAttribute(get_class("org.apache.lucene.analysis.tokenattributes.TypeAttribute"));
     token_stream.reset();
     while (token_stream.incrementToken()) {
       doc.addField(type_att.type().replace(/[<>]/g,'').toLowerCase()+"_ss", term_att.toString());
     }
     token_stream.end();
   } finally {
     // Always release the stream, even when analysis fails part-way through.
     token_stream.close();
   }
 }

 /**
  * processDelete hook: intentionally empty, this script takes no action here.
  *
  * @param cmd the command passed to the hook (unused)
  */
 function processDelete(cmd) {
   // nothing to do
 }

 /**
  * processMergeIndexes hook: intentionally empty, this script takes no action here.
  *
  * @param cmd the command passed to the hook (unused)
  */
 function processMergeIndexes(cmd) {
   // nothing to do
 }

 /**
  * processCommit hook: intentionally empty, this script takes no action here.
  *
  * @param cmd the command passed to the hook (unused)
  */
 function processCommit(cmd) {
   // nothing to do
 }

 /**
  * processRollback hook: intentionally empty, this script takes no action here.
  *
  * @param cmd the command passed to the hook (unused)
  */
 function processRollback(cmd) {
   // nothing to do
 }

 /**
  * finish hook: intentionally empty, this script has nothing to finalize.
  */
 function finish() {
   // nothing to do
 }
	/**
	 * Resolves the Java class object for a fully qualified class name.
	 *
	 * Two engine-specific lookup expressions are tried in order; both go through
	 * eval so that they are only evaluated at run time.
	 *
	 * @param name fully qualified Java class name, e.g. "java.lang.String"
	 * @return the value produced by whichever lookup expression succeeded
	 */
	function get_class(name) {
		try {
			// Java8 Nashorn
			return eval("Java.type(name).class");
		} catch (lookup_error) {
			// Java7 Rhino -- used whenever the expression above throws
			return eval("Packages."+name);
		}
	}

	/**
	 * Enriches a document that is being added with derived fields.
	 *
	 *  1. Logs the document id.
	 *  2. If "content_type" is present: drops everything from the first ';'
	 *     onward, sets "doc_type" when the type matches one of the known
	 *     families, and sets "content_type_type_s" / "content_type_subtype_s"
	 *     from the parts before / after the first '/'. A value without a '/'
	 *     is stored whole in "content_type_type_s" and no subtype is written.
	 *  3. If "content" is present: runs it through the index analyzer of the
	 *     "text_email_url" field type and adds every token's text to a field
	 *     named after the token type (angle brackets removed, lower-cased,
	 *     suffixed with "_ss"; e.g. a type of "<URL>" yields "url_ss").
	 *
	 * Uses the globals `logger` and `req`, and get_class() from this script.
	 *
	 * @param cmd add command; cmd.solrDoc is the document to enrich
	 */
	function processAdd(cmd) {

		// Declared with var so the document is not leaked as an implicit global.
		var doc = cmd.solrDoc; // org.apache.solr.common.SolrInputDocument
		var id = doc.getFieldValue("id");
		logger.info("update-script#processAdd: id=" + id);

		// The idea here is to use the file's content_type value to
		// simplify into user-friendly values, such that types of, say, image/jpeg and image/tiff
		// are in an "Images" facet

		var ct = doc.getFieldValue("content_type");
		if (ct) {
			// strip off semicolon onward
			var semicolon_index = ct.indexOf(';');
			if (semicolon_index >= 0) {
				ct = ct.substring(0, semicolon_index);
			}

			// Ordered [pattern, doc_type] rules; the first matching pattern wins.
			// The '|' in these patterns is regex alternation, not a literal pipe.
			var doc_type_rules = [
				[/^application\/rtf/, "doc"],
				[/wordprocessing/, "doc"],
				[/html/, "html"],
				[/^image\//, "image"],
				[/presentation|powerpoint/, "presentation"],
				[/spreadsheet|excel/, "spreadsheet"],
				[/^application\/pdf/, "pdf"],
				[/^text\/plain/, "text"]
			];

			var doc_type;
			for (var i = 0; i < doc_type_rules.length; i++) {
				if (doc_type_rules[i][0].test(ct)) {
					doc_type = doc_type_rules[i][1];
					break;
				}
			}
			if (doc_type) { doc.setField("doc_type", doc_type); }

			// and split type/subtype
			var slash_index = ct.indexOf('/');
			if (slash_index >= 0) {
				doc.setField("content_type_type_s", ct.substring(0, slash_index));
				doc.setField("content_type_subtype_s", ct.substring(slash_index + 1));
			} else {
				// No slash: there is no subtype to record, so keep the whole value as
				// the type instead of writing an empty type and a bogus subtype.
				doc.setField("content_type_type_s", ct);
			}
		}

		var content = doc.getFieldValue("content");
		if (!content) {
			return; //No content found, so we are done here
		}

		var analyzer =
			req.getCore().getLatestSchema()
			.getFieldTypeByName("text_email_url")
			.getIndexAnalyzer();

		var token_stream = analyzer.tokenStream("content", content);
		try {
			var term_att = token_stream.getAttribute(get_class("org.apache.lucene.analysis.tokenattributes.CharTermAttribute"));
			var type_att = token_stream.getAttribute(get_class("org.apache.lucene.analysis.tokenattributes.TypeAttribute"));
			token_stream.reset();
			while (token_stream.incrementToken()) {
				doc.addField(type_att.type().replace(/[<>]/g,'').toLowerCase()+"_ss", term_att.toString());
			}
			token_stream.end();
		} finally {
			// Always release the stream, even when analysis fails part-way through.
			token_stream.close();
		}
	}

	/**
	 * processDelete hook: intentionally empty, this script takes no action here.
	 *
	 * @param cmd the command passed to the hook (unused)
	 */
	function processDelete(cmd) {
		// nothing to do
	}

	/**
	 * processMergeIndexes hook: intentionally empty, this script takes no action here.
	 *
	 * @param cmd the command passed to the hook (unused)
	 */
	function processMergeIndexes(cmd) {
		// nothing to do
	}

	/**
	 * processCommit hook: intentionally empty, this script takes no action here.
	 *
	 * @param cmd the command passed to the hook (unused)
	 */
	function processCommit(cmd) {
		// nothing to do
	}

	/**
	 * processRollback hook: intentionally empty, this script takes no action here.
	 *
	 * @param cmd the command passed to the hook (unused)
	 */
	function processRollback(cmd) {
		// nothing to do
	}

	/**
	 * finish hook: intentionally empty, this script has nothing to finalize.
	 */
	function finish() {
		// nothing to do
	}