# solr/contrib/ltr/example/libsvm_formatter.py - lucene-solr - Git at Google

 from subprocess import call
 import os

 # Minimum relevance gap between two documents of the same group for the
 # pair to be written as training data; smaller gaps are skipped.
 PAIRWISE_THRESHOLD = 1.e-1
 # Feature differences whose absolute value does not exceed this are dropped
 # from the sparse difference vector.
 FEATURE_DIFF_THRESHOLD = 1.e-6

 class LibSvmFormatter:
     '''Builds pairwise libSVM training data from per-query feature vectors and
     converts a trained libSVM linear model into a Solr LTR LinearModel JSON file.

     Feature names are mapped to consecutive integer ids, starting at 1, the
     first time they are seen. The mapping is kept on the instance so the model
     conversion can translate the ids back into feature names.'''

     def __init__(self):
         # Set up here so the helpers work even when they are used before
         # processQueryDocFeatureVector (which resets all three).
         self.featureNameToId = {}
         self.featureIdToName = {}
         self.curFeatIndex = 1

     def processQueryDocFeatureVector(self,docClickInfo,trainingFile):
         '''Writes the pairwise training file for the given click information.

         docClickInfo is a list or generator of tuples
         (query, docId, relevance, source, featureVector), ordered so that rows
         belonging to the same query and source are adjacent. featureVector is
         a list of "name=value" strings. Documents sharing the same
         (query, source) are compared against each other and the resulting
         pairs are written to trainingFile. docId is not used.

         Calling this resets the feature name/id mapping of the instance.'''
         with open(trainingFile,"w") as output:
             self.featureNameToId = {}
             self.featureIdToName = {}
             self.curFeatIndex = 1
             curListOfFv = []
             # The group key is a tuple: concatenating query and source would
             # make ("ab", "c") and ("a", "bc") look like the same group.
             curQueryAndSource = None
             for query,docId,relevance,source,featureVector in docClickInfo:
                 queryAndSource = (query, source)
                 if curQueryAndSource != queryAndSource:
                     # A new group starts: flush the pairs of the previous one.
                     _writeRankSVMPairs(curListOfFv,output)
                     curListOfFv = []
                     curQueryAndSource = queryAndSource
                 curListOfFv.append((relevance,self._makeFeaturesMap(featureVector)))
             # This catches the last list of comparisons.
             _writeRankSVMPairs(curListOfFv,output)

     def _makeFeaturesMap(self,featureVector):
         '''Expects a list of "feature name=feature value" strings and returns a
         dict mapping the integer feature id to the value as a float.
         libSVM requires integer keys, hence the name-to-id translation.

         Raises ValueError if an entry has no "=" or a non-numeric value.'''
         features = {}
         for keyValuePairStr in featureVector:
             # Split on the last "=" only, so a feature name that itself
             # contains "=" is still parsed; the value is always a plain number.
             featName,featValue = keyValuePairStr.rsplit("=", 1)
             features[self._getFeatureId(featName)] = float(featValue)
         return features

     def _getFeatureId(self,key):
         '''Returns the integer id for the feature name, assigning the next
         free id (and recording the reverse mapping) on first sight.'''
         if key not in self.featureNameToId:
             self.featureNameToId[key] = self.curFeatIndex
             self.featureIdToName[self.curFeatIndex] = key
             self.curFeatIndex += 1
         return self.featureNameToId[key]

     def convertLibSvmModelToLtrModel(self,libSvmModelLocation,outputFile,modelName,featureStoreName):
         '''Writes a Solr LTR LinearModel JSON file from a libSVM model file.

         Every non-blank line after the line containing only "w" in
         libSvmModelLocation is read as a weight; the n-th weight belongs to
         the feature with id n. Features are listed in id order.

         Raises KeyError if the model file holds more weights than there are
         known features, and ValueError if a weight line is not a number.'''
         import json  # local import so the file-level import block stays untouched

         weights = {}
         with open(libSvmModelLocation, 'r') as inFile:
             startReading = False
             counter = 1
             for line in inFile:
                 value = line.strip()
                 if not startReading:
                     startReading = (value == 'w')
                 elif value:  # tolerate blank lines in the weight section
                     weights[self.featureIdToName[counter]] = float(value)
                     counter += 1

         featureNames = [self.featureIdToName[featId] for featId in sorted(self.featureIdToName)]
         model = {
             "class": "org.apache.solr.ltr.model.LinearModel",
             "store": str(featureStoreName),
             "name": str(modelName),
             "features": [{"name": featName} for featName in featureNames],
             "params": {"weights": weights},
         }
         # json takes care of quoting/escaping names that the former
         # hand-written output would have emitted verbatim.
         with open(outputFile,'w') as convertedOutFile:
             json.dump(model, convertedOutFile, indent=2)

 def _writeRankSVMPairs(listOfFeatures,output):
     '''Writes libSVM training lines for every pair of documents in one group.

     listOfFeatures is a list of (relevance, {featureId: value}) entries that
     all belong to the same comparison group, for example:
       [(4, {1:0.9, 2:0.9, 3:0.1}),
        (3, {1:0.7, 2:0.9, 3:0.2}),
        (1, {1:0.1, 2:0.9, 6:0.1})]
     For each pair whose relevance differs by more than PAIRWISE_THRESHOLD two
     lines are written: "+1" with (better - worse) and "-1" with
     (worse - better). Pairs of roughly equal relevance are skipped.'''
     for position, first in enumerate(listOfFeatures):
         for second in listOfFeatures[position + 1:]:
             firstFv, secondFv = first[1], second[1]
             relevanceGap = float(first[0]) - float(second[0])
             if relevanceGap > PAIRWISE_THRESHOLD:
                 better, worse = firstFv, secondFv
             elif relevanceGap < -PAIRWISE_THRESHOLD:
                 better, worse = secondFv, firstFv
             else:
                 # Approximately equal relevance carries no ranking signal.
                 continue
             outputLibSvmLine("+1",subtractFvMap(better,worse),output)
             outputLibSvmLine("-1",subtractFvMap(worse,better),output)

 def subtractFvMap(fv1,fv2):
     '''Returns the sparse feature vector fv1 - fv2 as a new dict.

     A feature missing from one side counts as 0 there. Every difference whose
     absolute value does not exceed FEATURE_DIFF_THRESHOLD is left out, and
     that includes features that only occur in fv1 (those used to be copied
     through even when they were zero). Neither input is modified.'''
     featureIds = list(fv1)
     featureIds.extend(featInd for featInd in fv2 if featInd not in fv1)
     retFv = {}
     for featInd in featureIds:
         if featInd not in fv2:
             subVal = fv1[featInd]
         elif featInd in fv1:
             subVal = fv1[featInd] - fv2[featInd]
         else:
             subVal = -fv2[featInd]
         # Keep the result sparse: near-zero differences are useless signals.
         if abs(subVal) > FEATURE_DIFF_THRESHOLD:
             retFv[featInd] = subVal
     return retFv

 def outputLibSvmLine(sign,fvMap,outputFile):
     '''Writes one libSVM line "<sign> <id>:<value> ..." followed by a newline.

     sign is the label text (e.g. "+1"), fvMap maps feature id to value and
     outputFile is any object with a write method. Features are written in
     ascending id order, which the libSVM format requires; dict order is not
     guaranteed to be ascending.'''
     parts = [sign]
     for feat in sorted(fvMap):
         parts.append(str(feat) + ":" + str(fvMap[feat]))
     outputFile.write(" ".join(parts) + "\n")

 def trainLibSvm(libraryLocation,libraryOptions,trainingFileName,trainedModelFileName):
     '''Runs the training executable on the training file.

     libraryLocation is the path of the executable. libraryOptions is either
     a command-line style string (e.g. "-s 0 -c 1"), which is split into
     separate arguments, or a list/tuple of arguments; an empty or None value
     adds no arguments. The executable is invoked as
     <library> <options...> <trainingFileName> <trainedModelFileName>.

     Returns the exit status of the executable.
     Raises Exception if libraryLocation is not an existing file.'''
     if not os.path.isfile(libraryLocation):
         raise Exception("NO LIBRARY FOUND: " + libraryLocation)
     import shlex  # local import so the file-level import block stays untouched
     if isinstance(libraryOptions, (list, tuple)):
         options = list(libraryOptions)
     elif libraryOptions:
         # A string holding several flags must become several argv entries,
         # otherwise the executable sees them as one unknown option.
         options = shlex.split(libraryOptions)
     else:
         options = []
     return call([libraryLocation] + options + [trainingFileName, trainedModelFileName])
	from subprocess import call
	import os

	# Minimum relevance gap between two documents of the same group for the
	# pair to be written as training data; smaller gaps are skipped.
	PAIRWISE_THRESHOLD = 1.e-1
	# Feature differences whose absolute value does not exceed this are dropped
	# from the sparse difference vector.
	FEATURE_DIFF_THRESHOLD = 1.e-6

	class LibSvmFormatter:
		'''Builds pairwise libSVM training data from per-query feature vectors and
		converts a trained libSVM linear model into a Solr LTR LinearModel JSON file.

		Feature names are mapped to consecutive integer ids, starting at 1, the
		first time they are seen. The mapping is kept on the instance so the model
		conversion can translate the ids back into feature names.'''

		def __init__(self):
			# Set up here so the helpers work even when they are used before
			# processQueryDocFeatureVector (which resets all three).
			self.featureNameToId = {}
			self.featureIdToName = {}
			self.curFeatIndex = 1

		def processQueryDocFeatureVector(self,docClickInfo,trainingFile):
			'''Writes the pairwise training file for the given click information.

			docClickInfo is a list or generator of tuples
			(query, docId, relevance, source, featureVector), ordered so that rows
			belonging to the same query and source are adjacent. featureVector is
			a list of "name=value" strings. Documents sharing the same
			(query, source) are compared against each other and the resulting
			pairs are written to trainingFile. docId is not used.

			Calling this resets the feature name/id mapping of the instance.'''
			with open(trainingFile,"w") as output:
				self.featureNameToId = {}
				self.featureIdToName = {}
				self.curFeatIndex = 1
				curListOfFv = []
				# The group key is a tuple: concatenating query and source would
				# make ("ab", "c") and ("a", "bc") look like the same group.
				curQueryAndSource = None
				for query,docId,relevance,source,featureVector in docClickInfo:
					queryAndSource = (query, source)
					if curQueryAndSource != queryAndSource:
						# A new group starts: flush the pairs of the previous one.
						_writeRankSVMPairs(curListOfFv,output)
						curListOfFv = []
						curQueryAndSource = queryAndSource
					curListOfFv.append((relevance,self._makeFeaturesMap(featureVector)))
				# This catches the last list of comparisons.
				_writeRankSVMPairs(curListOfFv,output)

		def _makeFeaturesMap(self,featureVector):
			'''Expects a list of "feature name=feature value" strings and returns a
			dict mapping the integer feature id to the value as a float.
			libSVM requires integer keys, hence the name-to-id translation.

			Raises ValueError if an entry has no "=" or a non-numeric value.'''
			features = {}
			for keyValuePairStr in featureVector:
				# Split on the last "=" only, so a feature name that itself
				# contains "=" is still parsed; the value is always a plain number.
				featName,featValue = keyValuePairStr.rsplit("=", 1)
				features[self._getFeatureId(featName)] = float(featValue)
			return features

		def _getFeatureId(self,key):
			'''Returns the integer id for the feature name, assigning the next
			free id (and recording the reverse mapping) on first sight.'''
			if key not in self.featureNameToId:
				self.featureNameToId[key] = self.curFeatIndex
				self.featureIdToName[self.curFeatIndex] = key
				self.curFeatIndex += 1
			return self.featureNameToId[key]

		def convertLibSvmModelToLtrModel(self,libSvmModelLocation,outputFile,modelName,featureStoreName):
			'''Writes a Solr LTR LinearModel JSON file from a libSVM model file.

			Every non-blank line after the line containing only "w" in
			libSvmModelLocation is read as a weight; the n-th weight belongs to
			the feature with id n. Features are listed in id order.

			Raises KeyError if the model file holds more weights than there are
			known features, and ValueError if a weight line is not a number.'''
			import json  # local import so the file-level import block stays untouched

			weights = {}
			with open(libSvmModelLocation, 'r') as inFile:
				startReading = False
				counter = 1
				for line in inFile:
					value = line.strip()
					if not startReading:
						startReading = (value == 'w')
					elif value:  # tolerate blank lines in the weight section
						weights[self.featureIdToName[counter]] = float(value)
						counter += 1

			featureNames = [self.featureIdToName[featId] for featId in sorted(self.featureIdToName)]
			model = {
				"class": "org.apache.solr.ltr.model.LinearModel",
				"store": str(featureStoreName),
				"name": str(modelName),
				"features": [{"name": featName} for featName in featureNames],
				"params": {"weights": weights},
			}
			# json takes care of quoting/escaping names that the former
			# hand-written output would have emitted verbatim.
			with open(outputFile,'w') as convertedOutFile:
				json.dump(model, convertedOutFile, indent=2)

	def _writeRankSVMPairs(listOfFeatures,output):
		'''Writes libSVM training lines for every pair of documents in one group.

		listOfFeatures is a list of (relevance, {featureId: value}) entries that
		all belong to the same comparison group, for example:
		  [(4, {1:0.9, 2:0.9, 3:0.1}),
		   (3, {1:0.7, 2:0.9, 3:0.2}),
		   (1, {1:0.1, 2:0.9, 6:0.1})]
		For each pair whose relevance differs by more than PAIRWISE_THRESHOLD two
		lines are written: "+1" with (better - worse) and "-1" with
		(worse - better). Pairs of roughly equal relevance are skipped.'''
		for d1 in range(0,len(listOfFeatures)):
			for d2 in range(d1+1,len(listOfFeatures)):
				doc1,doc2 = listOfFeatures[d1], listOfFeatures[d2]
				fv1,fv2 = doc1[1],doc2[1]
				d1Relevance, d2Relevance = float(doc1[0]),float(doc2[0])
				if d1Relevance - d2Relevance > PAIRWISE_THRESHOLD: # d1 is more relevant
					outputLibSvmLine("+1",subtractFvMap(fv1,fv2),output)
					outputLibSvmLine("-1",subtractFvMap(fv2,fv1),output)
				elif d1Relevance - d2Relevance < -PAIRWISE_THRESHOLD: # d2 is more relevant
					outputLibSvmLine("+1",subtractFvMap(fv2,fv1),output)
					outputLibSvmLine("-1",subtractFvMap(fv1,fv2),output)
				# Otherwise the relevance is approximately equal: no ranking
				# signal, so nothing is written for this pair.

	def subtractFvMap(fv1,fv2):
		'''Returns the sparse feature vector fv1 - fv2 as a new dict.

		A feature missing from one side counts as 0 there. Every difference whose
		absolute value does not exceed FEATURE_DIFF_THRESHOLD is left out, and
		that includes features that only occur in fv1 (those used to be copied
		through even when they were zero). Neither input is modified.'''
		featureIds = list(fv1)
		featureIds.extend(featInd for featInd in fv2 if featInd not in fv1)
		retFv = {}
		for featInd in featureIds:
			if featInd not in fv2:
				subVal = fv1[featInd]
			elif featInd in fv1:
				subVal = fv1[featInd] - fv2[featInd]
			else:
				subVal = -fv2[featInd]
			# Keep the result sparse: near-zero differences are useless signals.
			if abs(subVal) > FEATURE_DIFF_THRESHOLD:
				retFv[featInd] = subVal
		return retFv

	def outputLibSvmLine(sign,fvMap,outputFile):
		'''Writes one libSVM line "<sign> <id>:<value> ..." followed by a newline.

		sign is the label text (e.g. "+1"), fvMap maps feature id to value and
		outputFile is any object with a write method. Features are written in
		ascending id order, which the libSVM format requires; dict order is not
		guaranteed to be ascending.'''
		parts = [sign]
		for feat in sorted(fvMap):
			parts.append(str(feat) + ":" + str(fvMap[feat]))
		outputFile.write(" ".join(parts) + "\n")

	def trainLibSvm(libraryLocation,libraryOptions,trainingFileName,trainedModelFileName):
		'''Runs the training executable on the training file.

		libraryLocation is the path of the executable. libraryOptions is either
		a command-line style string (e.g. "-s 0 -c 1"), which is split into
		separate arguments, or a list/tuple of arguments; an empty or None value
		adds no arguments. The executable is invoked as
		<library> <options...> <trainingFileName> <trainedModelFileName>.

		Returns the exit status of the executable.
		Raises Exception if libraryLocation is not an existing file.'''
		if not os.path.isfile(libraryLocation):
			raise Exception("NO LIBRARY FOUND: " + libraryLocation)
		import shlex  # local import so the file-level import block stays untouched
		if isinstance(libraryOptions, (list, tuple)):
			options = list(libraryOptions)
		elif libraryOptions:
			# A string holding several flags must become several argv entries,
			# otherwise the executable sees them as one unknown option.
			options = shlex.split(libraryOptions)
		else:
			options = []
		return call([libraryLocation] + options + [trainingFileName, trainedModelFileName])