topic-web/tools/newsmlimporter.py - stanbol - Git at Google

 #!/usr/bin/env python
 # Licensed to the Apache Software Foundation (ASF) under one or more
 # contributor license agreements.  See the NOTICE file distributed with
 # this work for additional information regarding copyright ownership.
 # The ASF licenses this file to You under the Apache License, Version 2.0
 # (the "License"); you may not use this file except in compliance with
 # the License.  You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """Basic python script to load NewsML documents as training set

 Requires Python 2.7 and lxml.

 TODO: port to Python 3 as well if not working by default.
 """
 from __future__ import print_function

 import os
 from time import time
 from lxml import html
 from lxml import etree
 from urllib import quote
 import urllib2
 from hashlib import sha1


 # Base URI prepended to the ``FormalName`` attribute value of each subject
 # element to build the full subject identifier.
 IPTC_SUBJECT_PREFIX = "http://cv.iptc.org/newscodes/subjectcode/"


 def find_text_and_subjects(newsml_content,
                            subject_tags=('SubjectMatter', 'SubjectDetail'),
                            text_tags=('HeadLine',),
                            html_tags=('body.content',)):
     """Extract the text and the subject identifiers of a NewsML document.

     The content is parsed twice: once as XML to read the structured
     elements, and once with the HTML parser so that ``text_content()`` can
     be used on the elements holding HTML-like markup.

     :param newsml_content: serialized NewsML document.
     :param subject_tags: names of the elements whose ``FormalName``
         attribute holds a subject code.
     :param text_tags: names of the elements whose own text is collected.
     :param html_tags: names of the elements whose whole text content is
         collected through the HTML parser.
     :returns: a ``(text, subjects)`` tuple; ``text`` is the collected text
         items joined by blank lines and ``subjects`` is the list of subject
         codes prefixed with ``IPTC_SUBJECT_PREFIX``.
     """
     # First parse of the document as XML for the structured attributes
     xtree = etree.ElementTree(etree.fromstring(newsml_content))

     # An element without any text node has ``text`` set to None: handle it
     # as an empty string instead of failing on ``None.strip()``.
     text_items = [(e.text or '').strip()
                   for tag in text_tags
                   for e in xtree.findall('//' + tag)]

     subjects = []
     for tag in subject_tags:
         for e in xtree.findall('//' + tag):
             formal_name = e.get('FormalName')
             # Skip subject elements that do not carry a code: there is
             # nothing to append to the prefix.
             if formal_name is not None:
                 subjects.append(IPTC_SUBJECT_PREFIX + formal_name)

     # Then use the HTML parser to find the parts that look like HTML, so
     # that the text_content method can be leveraged on them.
     htree = etree.ElementTree(html.document_fromstring(newsml_content))

     text_items += [e.text_content().strip()
                    for tag in html_tags
                    for e in htree.findall('//' + tag)]
     text = "\n\n".join(text_items)
     return text, subjects


 def register_newsml_document(text, codes, url):
     """POST the text of a document to ``url`` along with its subject codes.

     The request URL is ``url`` followed by an ``example_id`` query parameter
     (SHA-1 hex digest of the UTF-8 encoded text) and one ``concept``
     parameter per code. The text itself is sent as the ``text/plain``
     request body.

     :param text: text of the document.
     :param codes: iterable of subject codes; each one is URL-quoted.
     :param url: endpoint URL; the query string is appended after a ``?``.

     Errors raised by urllib2 while opening the request propagate to the
     caller.
     """
     # Encode once: the same bytes are hashed and sent as the request body.
     payload = text.encode('utf-8')
     example_id = sha1(payload).hexdigest()

     query = ["example_id=%s" % example_id]
     query.extend("concept=%s" % quote(code) for code in codes)

     request = urllib2.Request(url + "?" + "&".join(query), data=payload)
     request.add_header('Content-Type', 'text/plain')
     response = urllib2.build_opener().open(request)
     try:
         # The body is read in full and discarded, as before.
         response.read()
     finally:
         # Release the connection even if reading the body fails.
         response.close()


 def print_newsml_summary(text, codes, server_url=None):
     """Print the first text item of a document followed by its codes.

     Takes the same arguments as the registering handler so that it can be
     swapped in for debugging; ``server_url`` is accepted but not used.
     """
     # Everything before the first blank line is the first text item.
     first_item, _, _ = text.partition('\n\n')
     print(first_item)
     for subject in codes:
         print('code: ' + subject)
     # Blank line separating consecutive summaries.
     print()


 if __name__ == "__main__":
     # Command line entry point:
     #   newsmlimporter.py <topfolder> <max_documents> <server_url>
     # Walks <topfolder>, extracts text and subjects from each ``.xml`` file
     # and hands them to the handler until <max_documents> were handled.
     import sys

     # TODO: use argparse and debug switch to use print_newsml_summary
     # instead of the default handler
     if len(sys.argv) < 4:
         sys.exit("usage: %s <topfolder> <max_documents> <server_url>"
                  % sys.argv[0])
     topfolder = sys.argv[1]
     max_count = int(sys.argv[2])
     server_url = sys.argv[3]
     handle_news = register_newsml_document

     count = 0
     previous = time()
     for dirpath, dirnames, filenames in os.walk(topfolder):
         if count >= max_count:
             break

         # Removing the entry in place prevents os.walk from descending
         # into the version control folder.
         if '.svn' in dirnames:
             dirnames.remove('.svn')

         for filename in filenames:
             if count >= max_count:
                 break
             if not filename.endswith('.xml'):
                 continue
             # dirpath already starts with topfolder: joining topfolder again
             # would duplicate it whenever a relative path is given.
             full_path = os.path.join(dirpath, filename)
             with open(full_path, 'rb') as newsml_file:
                 newsml_content = newsml_file.read()
             text, codes = find_text_and_subjects(newsml_content)
             if not codes:
                 # ignore document without subject info
                 continue
             handle_news(text, codes, server_url)
             count += 1
             if count % 100 == 0:
                 # Report the time spent on the last batch of 100 documents.
                 now = time()
                 delta, previous = now - previous, now
                 print("Processed news %03d/%03d in %06.3fs"
                       % (count, max_count, delta))
	#!/usr/bin/env python
	# Licensed to the Apache Software Foundation (ASF) under one or more
	# contributor license agreements. See the NOTICE file distributed with
	# this work for additional information regarding copyright ownership.
	# The ASF licenses this file to You under the Apache License, Version 2.0
	# (the "License"); you may not use this file except in compliance with
	# the License. You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.
	"""Basic python script to load NewsML documents as training set

	Requires Python 2.7 and lxml.

	TODO: port to Python 3 as well if not working by default.
	"""
	from __future__ import print_function

	import os
	from time import time
	from lxml import html
	from lxml import etree
	from urllib import quote
	import urllib2
	from hashlib import sha1


	# Base URI prepended to the ``FormalName`` attribute value of each subject
	# element to build the full subject identifier.
	IPTC_SUBJECT_PREFIX = "http://cv.iptc.org/newscodes/subjectcode/"


	def find_text_and_subjects(newsml_content,
	                           subject_tags=('SubjectMatter', 'SubjectDetail'),
	                           text_tags=('HeadLine',),
	                           html_tags=('body.content',)):
	    """Extract the text and the subject identifiers of a NewsML document.

	    The content is parsed twice: once as XML to read the structured
	    elements, and once with the HTML parser so that ``text_content()`` can
	    be used on the elements holding HTML-like markup.

	    :param newsml_content: serialized NewsML document.
	    :param subject_tags: names of the elements whose ``FormalName``
	        attribute holds a subject code.
	    :param text_tags: names of the elements whose own text is collected.
	    :param html_tags: names of the elements whose whole text content is
	        collected through the HTML parser.
	    :returns: a ``(text, subjects)`` tuple; ``text`` is the collected text
	        items joined by blank lines and ``subjects`` is the list of subject
	        codes prefixed with ``IPTC_SUBJECT_PREFIX``.
	    """
	    # First parse of the document as XML for the structured attributes
	    xtree = etree.ElementTree(etree.fromstring(newsml_content))

	    # An element without any text node has ``text`` set to None: handle it
	    # as an empty string instead of failing on ``None.strip()``.
	    text_items = [(e.text or '').strip()
	                  for tag in text_tags
	                  for e in xtree.findall('//' + tag)]

	    subjects = []
	    for tag in subject_tags:
	        for e in xtree.findall('//' + tag):
	            formal_name = e.get('FormalName')
	            # Skip subject elements that do not carry a code: there is
	            # nothing to append to the prefix.
	            if formal_name is not None:
	                subjects.append(IPTC_SUBJECT_PREFIX + formal_name)

	    # Then use the HTML parser to find the parts that look like HTML, so
	    # that the text_content method can be leveraged on them.
	    htree = etree.ElementTree(html.document_fromstring(newsml_content))

	    text_items += [e.text_content().strip()
	                   for tag in html_tags
	                   for e in htree.findall('//' + tag)]
	    text = "\n\n".join(text_items)
	    return text, subjects


	def register_newsml_document(text, codes, url):
	    """POST the text of a document to ``url`` along with its subject codes.

	    The request URL is ``url`` followed by an ``example_id`` query parameter
	    (SHA-1 hex digest of the UTF-8 encoded text) and one ``concept``
	    parameter per code. The text itself is sent as the ``text/plain``
	    request body.

	    :param text: text of the document.
	    :param codes: iterable of subject codes; each one is URL-quoted.
	    :param url: endpoint URL; the query string is appended after a ``?``.

	    Errors raised by urllib2 while opening the request propagate to the
	    caller.
	    """
	    # Encode once: the same bytes are hashed and sent as the request body.
	    payload = text.encode('utf-8')
	    example_id = sha1(payload).hexdigest()

	    query = ["example_id=%s" % example_id]
	    query.extend("concept=%s" % quote(code) for code in codes)

	    request = urllib2.Request(url + "?" + "&".join(query), data=payload)
	    request.add_header('Content-Type', 'text/plain')
	    response = urllib2.build_opener().open(request)
	    try:
	        # The body is read in full and discarded, as before.
	        response.read()
	    finally:
	        # Release the connection even if reading the body fails.
	        response.close()


	def print_newsml_summary(text, codes, server_url=None):
	    """Print the first text item of a document followed by its codes.

	    Takes the same arguments as the registering handler so that it can be
	    swapped in for debugging; ``server_url`` is accepted but not used.
	    """
	    # Everything before the first blank line is the first text item.
	    print(text.split('\n\n')[0])
	    for code in codes:
	        print('code: ' + code)
	    # Blank line separating consecutive summaries.
	    print()


	if __name__ == "__main__":
	    # Command line entry point:
	    #   newsmlimporter.py <topfolder> <max_documents> <server_url>
	    # Walks <topfolder>, extracts text and subjects from each ``.xml`` file
	    # and hands them to the handler until <max_documents> were handled.
	    import sys

	    # TODO: use argparse and debug switch to use print_newsml_summary
	    # instead of the default handler
	    if len(sys.argv) < 4:
	        sys.exit("usage: %s <topfolder> <max_documents> <server_url>"
	                 % sys.argv[0])
	    topfolder = sys.argv[1]
	    max_count = int(sys.argv[2])
	    server_url = sys.argv[3]
	    handle_news = register_newsml_document

	    count = 0
	    previous = time()
	    for dirpath, dirnames, filenames in os.walk(topfolder):
	        if count >= max_count:
	            break

	        # Removing the entry in place prevents os.walk from descending
	        # into the version control folder.
	        if '.svn' in dirnames:
	            dirnames.remove('.svn')

	        for filename in filenames:
	            if count >= max_count:
	                break
	            if not filename.endswith('.xml'):
	                continue
	            # dirpath already starts with topfolder: joining topfolder again
	            # would duplicate it whenever a relative path is given.
	            full_path = os.path.join(dirpath, filename)
	            with open(full_path, 'rb') as newsml_file:
	                newsml_content = newsml_file.read()
	            text, codes = find_text_and_subjects(newsml_content)
	            if not codes:
	                # ignore document without subject info
	                continue
	            handle_news(text, codes, server_url)
	            count += 1
	            if count % 100 == 0:
	                # Report the time spent on the last batch of 100 documents.
	                now = time()
	                delta, previous = now - previous, now
	                print("Processed news %03d/%03d in %06.3fs"
	                      % (count, max_count, delta))