#-------------------------------------------------------------
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
#-------------------------------------------------------------
from bs4 import BeautifulSoup
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import string
import numpy as np
import csv
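
# The tokenizers, lemmatizer, and stopword list above depend on NLTK data
# packages that are not bundled with the library itself. A minimal sketch of
# an idempotent one-time fetch (a no-op if the resources are already present):
for resource in ('punkt', 'wordnet', 'stopwords'):
    nltk.download(resource, quiet=True)
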
def create_dataset(input_file_name, output_file_name):
    # Read the raw Reuters SGML file and parse it with BeautifulSoup;
    # html.parser lowercases tag names, so <BODY> matches 'body'.
    with open(input_file_name, 'r') as f:
        data = f.read()
    lemmatizer = WordNetLemmatizer()
    soup = BeautifulSoup(data, 'html.parser')
    # Build the stopword set once up front; calling stopwords.words('english')
    # for every token is needlessly slow.
    stop_words = set(stopwords.words('english'))
    text_matrix = []
    # Each <body> tag holds one article: split it into sentences, tokenize
    # each sentence, and drop stopwords and punctuation.
    for item in soup.find_all('body'):
        for sentence in sent_tokenize(item.text):
            text_matrix.append([token for token in word_tokenize(sentence)
                                if token.lower() not in stop_words
                                and token not in string.punctuation])
    # Lowercase every surviving token and lemmatize it as a verb.
    for i in range(len(text_matrix)):
        for j in range(len(text_matrix[i])):
            text_matrix[i][j] = lemmatizer.lemmatize(text_matrix[i][j].lower(), pos='v')
    # Pad each row with None to the length of the longest sentence so the
    # result is a rectangular matrix.
    length = max(map(len, text_matrix))
    text_matrix = np.array([row + [None] * (length - len(row)) for row in text_matrix])
    # Mode 'w' truncates an existing file (or creates a new one), so a single
    # open suffices; newline='' lets the csv module manage line endings, and
    # None cells are written as empty fields.
    with open(output_file_name, 'w', newline='') as csvFile:
        writer = csv.writer(csvFile)
        writer.writerows(text_matrix)
if __name__ == '__main__':
    create_dataset('reut2-000.sgm', 'text_matrix.csv')
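    # Illustrative sanity check (a sketch, assuming the CSV above was just
    # written): padded cells read back as empty strings, so filter them out
    # to recover the original token list for the first sentence.
    with open('text_matrix.csv', newline='') as csvFile:
        first_row = next(csv.reader(csvFile))
        print([tok for tok in first_row if tok])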