solr/example/films/film_data_generator.py - lucene-solr - Git at Google

 # Licensed to the Apache Software Foundation (ASF) under one or more
 # contributor license agreements.  See the NOTICE file distributed with
 # this work for additional information regarding copyright ownership.
 # The ASF licenses this file to You under the Apache License, Version 2.0
 # (the "License"); you may not use this file except in compliance with
 # the License.  You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.

 """
 This will generate a movie data set of 1100 records.
 These are the first 1100 movies which appear when querying the Freebase of type '/film/film'.
 Here is the link to the freebase page - https://www.freebase.com/film/film?schema=

 Usage - python3 film_data_generator.py
 """

 import csv
 import copy
 import json
 import codecs
 import datetime
 import urllib.parse
 import urllib.request
 import xml.etree.cElementTree as ET
 from xml.dom import minidom

 # Cap on follow-up page requests made by the main loop; 10 limits it to 1100 docs.
 MAX_ITERATIONS = 10

 # A Google developer API key is required to run this script.
 API_KEY = '<insert your Google developer API key>'
 service_url = 'https://www.googleapis.com/freebase/v1/mqlread'

 # Query sent (JSON-encoded) to the mqlread service by do_query().
 # NOTE(review): the None / [] values are presumably MQL placeholders for
 # properties the service should return, and "initial_release_date>" a
 # lower-bound filter -- confirm against the MQL documentation.
 query = [
   {
     "id": None,
     "name": None,
     "initial_release_date": None,
     "directed_by": [],
     "genre": [],
     "type": "/film/film",
     "initial_release_date>": "2000",
   },
 ]

 def gen_csv(filmlist):
   """Write the films to films.csv, one row per film.

   List values are flattened into a single '|'-delimited string. The rows
   are built as new dicts, so the caller's filmlist is left untouched.
   Columns with no value for a film (e.g. 'type') are written empty.
   """
   columns = ['name', 'directed_by', 'genre', 'type', 'id', 'initial_release_date']
   # Flatten multi-valued fields before the file is opened.
   rows = [
     {
       field: '|'.join(value) if isinstance(value, list) else value
       for field, value in film.items()
     }
     for film in filmlist
   ]
   with open('films.csv', 'w', newline='', encoding='utf8') as out:
     writer = csv.DictWriter(out, columns)
     writer.writeheader()
     writer.writerows(rows)

 def gen_json(filmlist):
   """Write the films to films.json as JSON indented by two spaces.

   Args:
     filmlist: list of film dicts; it is only read, never modified.
   """
   # Serialize before opening the file so a serialization error cannot
   # leave behind a truncated films.json.
   payload = json.dumps(filmlist, indent=2)
   # json.dumps escapes non-ASCII by default, so the payload is plain ASCII;
   # the explicit encoding just mirrors gen_csv.
   with open('films.json', 'w', encoding='utf8') as jsonfile:
     jsonfile.write(payload)

 def gen_xml(filmlist):
   """Write the films to films.xml as an <add> document.

   Each film becomes a <doc>; each key becomes a <field name="key"> element.
   A list value yields one <field> per item (none for an empty list).

   Args:
     filmlist: list of film dicts; it is only read, never modified.
   """
   root = ET.Element("add")
   for film in filmlist:
     doc = ET.SubElement(root, "doc")
     for key, value in film.items():
       # Treat a scalar as a one-item list so both cases share one code path.
       values = value if isinstance(value, list) else [value]
       for item in values:
         field = ET.SubElement(doc, "field")
         field.set("name", key)
         field.text = item
   pretty = minidom.parseString(ET.tostring(root, 'utf-8')).toprettyxml(indent="  ")
   # The generated prolog carries no encoding declaration, so XML readers
   # assume UTF-8; write UTF-8 explicitly instead of the locale's encoding.
   with open('films.xml', 'w', encoding='utf8') as f:
     f.write(pretty)

 def do_query(filmlist, cursor=""):
   """Fetch one page of results from the mqlread service into filmlist.

   Each result item has its 'type' entry removed, loses its
   'initial_release_date' entry when that is not a full YYYY-MM-DD date,
   and is then appended to filmlist.

   Args:
     filmlist: list that receives the result items; mutated in place.
     cursor: paging cursor from the previous call; the empty string is
       sent for the first page.

   Returns:
     The response's "cursor" entry, or None when the response has none.

   Raises:
     KeyError: if the response carries no 'result' entry.
   """
   params = {
     'query': json.dumps(query),
     'key': API_KEY,
     'cursor': cursor,
   }
   url = service_url + '?' + urllib.parse.urlencode(params)
   # Close the HTTP response deterministically instead of leaking it.
   with urllib.request.urlopen(url) as reply:
     response = json.loads(reply.read().decode('utf-8'))
   for item in response['result']:
     # It's always /film/film. No point of adding this.
     item.pop('type', None)
     try:
       datetime.datetime.strptime(item.get('initial_release_date'), "%Y-%m-%d")
     except (TypeError, ValueError):
       # Missing, null or partial date (TypeError covers None). Keep it
       # simple by removing the date field from that doc.
       item.pop('initial_release_date', None)
     filmlist.append(item)
   return response.get("cursor")


 if __name__ == "__main__":
   filmlist = []
   # The first page is always fetched.
   cursor = do_query(filmlist)
   # Then fetch at most MAX_ITERATIONS further pages, stopping early once
   # the service returns no cursor. A bounded loop keeps the cap effective
   # even for a cap of zero or less, which the old equality check never hit.
   for _ in range(MAX_ITERATIONS):
     if not cursor:
       break
     cursor = do_query(filmlist, cursor)

   # Emit the same data set in all three formats.
   gen_json(filmlist)
   gen_csv(filmlist)
   gen_xml(filmlist)
	# Licensed to the Apache Software Foundation (ASF) under one or more
	# contributor license agreements. See the NOTICE file distributed with
	# this work for additional information regarding copyright ownership.
	# The ASF licenses this file to You under the Apache License, Version 2.0
	# (the "License"); you may not use this file except in compliance with
	# the License. You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.

	"""
	This will generate a movie data set of 1100 records.
	These are the first 1100 movies which appear when querying the Freebase of type '/film/film'.
	Here is the link to the freebase page - https://www.freebase.com/film/film?schema=

	Usage - python3 film_data_generator.py
	"""

	import csv
	import copy
	import json
	import codecs
	import datetime
	import urllib.parse
	import urllib.request
	import xml.etree.cElementTree as ET
	from xml.dom import minidom

	# Cap on follow-up page requests made by the main loop; 10 limits it to 1100 docs.
	MAX_ITERATIONS = 10

	# A Google developer API key is required to run this script.
	API_KEY = '<insert your Google developer API key>'
	service_url = 'https://www.googleapis.com/freebase/v1/mqlread'

	# Query sent (JSON-encoded) to the mqlread service by do_query().
	# NOTE(review): the None / [] values are presumably MQL placeholders for
	# properties the service should return, and "initial_release_date>" a
	# lower-bound filter -- confirm against the MQL documentation.
	query = [
	  {
	    "id": None,
	    "name": None,
	    "initial_release_date": None,
	    "directed_by": [],
	    "genre": [],
	    "type": "/film/film",
	    "initial_release_date>": "2000",
	  },
	]

	def gen_csv(filmlist):
	  """Write the films to films.csv, one row per film.

	  List values are flattened into a single '|'-delimited string. The rows
	  are built as new dicts, so the caller's filmlist is left untouched.
	  Columns with no value for a film (e.g. 'type') are written empty.
	  """
	  columns = ['name', 'directed_by', 'genre', 'type', 'id', 'initial_release_date']
	  # The delimiter is a bare '|'; the escaped '\|' form would put literal
	  # backslashes into the data.
	  rows = [
	    {
	      field: '|'.join(value) if isinstance(value, list) else value
	      for field, value in film.items()
	    }
	    for film in filmlist
	  ]
	  with open('films.csv', 'w', newline='', encoding='utf8') as out:
	    writer = csv.DictWriter(out, columns)
	    writer.writeheader()
	    writer.writerows(rows)

	def gen_json(filmlist):
	  """Write the films to films.json as JSON indented by two spaces.

	  Args:
	    filmlist: list of film dicts; it is only read, never modified.
	  """
	  # Serialize before opening the file so a serialization error cannot
	  # leave behind a truncated films.json.
	  payload = json.dumps(filmlist, indent=2)
	  # json.dumps escapes non-ASCII by default, so the payload is plain ASCII;
	  # the explicit encoding just mirrors gen_csv.
	  with open('films.json', 'w', encoding='utf8') as jsonfile:
	    jsonfile.write(payload)

	def gen_xml(filmlist):
	  """Write the films to films.xml as an <add> document.

	  Each film becomes a <doc>; each key becomes a <field name="key"> element.
	  A list value yields one <field> per item (none for an empty list).

	  Args:
	    filmlist: list of film dicts; it is only read, never modified.
	  """
	  root = ET.Element("add")
	  for film in filmlist:
	    doc = ET.SubElement(root, "doc")
	    for key, value in film.items():
	      # Treat a scalar as a one-item list so both cases share one code path.
	      values = value if isinstance(value, list) else [value]
	      for item in values:
	        field = ET.SubElement(doc, "field")
	        field.set("name", key)
	        field.text = item
	  # Two-space pretty-print indent (the one-space string was collapsed whitespace).
	  pretty = minidom.parseString(ET.tostring(root, 'utf-8')).toprettyxml(indent="  ")
	  # The generated prolog carries no encoding declaration, so XML readers
	  # assume UTF-8; write UTF-8 explicitly instead of the locale's encoding.
	  with open('films.xml', 'w', encoding='utf8') as f:
	    f.write(pretty)

	def do_query(filmlist, cursor=""):
	  """Fetch one page of results from the mqlread service into filmlist.

	  Each result item has its 'type' entry removed, loses its
	  'initial_release_date' entry when that is not a full YYYY-MM-DD date,
	  and is then appended to filmlist.

	  Args:
	    filmlist: list that receives the result items; mutated in place.
	    cursor: paging cursor from the previous call; the empty string is
	      sent for the first page.

	  Returns:
	    The response's "cursor" entry, or None when the response has none.

	  Raises:
	    KeyError: if the response carries no 'result' entry.
	  """
	  params = {
	    'query': json.dumps(query),
	    'key': API_KEY,
	    'cursor': cursor,
	  }
	  url = service_url + '?' + urllib.parse.urlencode(params)
	  # Close the HTTP response deterministically instead of leaking it.
	  with urllib.request.urlopen(url) as reply:
	    response = json.loads(reply.read().decode('utf-8'))
	  for item in response['result']:
	    # It's always /film/film. No point of adding this.
	    item.pop('type', None)
	    try:
	      datetime.datetime.strptime(item.get('initial_release_date'), "%Y-%m-%d")
	    except (TypeError, ValueError):
	      # Missing, null or partial date (TypeError covers None). Keep it
	      # simple by removing the date field from that doc.
	      item.pop('initial_release_date', None)
	    filmlist.append(item)
	  return response.get("cursor")


	if __name__ == "__main__":
	  filmlist = []
	  # The first page is always fetched.
	  cursor = do_query(filmlist)
	  # Then fetch at most MAX_ITERATIONS further pages, stopping early once
	  # the service returns no cursor. A bounded loop keeps the cap effective
	  # even for a cap of zero or less, which the old equality check never hit.
	  for _ in range(MAX_ITERATIONS):
	    if not cursor:
	      break
	    cursor = do_query(filmlist, cursor)

	  # Emit the same data set in all three formats.
	  gen_json(filmlist)
	  gen_csv(filmlist)
	  gen_xml(filmlist)