# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#pylint: disable=no-member, too-many-instance-attributes
"""This script uses pyenchant to check spelling for MXNet
documentation website.
An exclude list is provided to avoid checking specific word,
such as NDArray.
"""
from __future__ import print_function
import os
import sys
import re
try:
    from HTMLParser import HTMLParser  # Python 2
except ImportError:
    from html.parser import HTMLParser  # Python 3
import enchant
from enchant.checker import SpellChecker
import grammar_check
import html2text
try:
    reload(sys)  # Python 2
    sys.setdefaultencoding('utf-8')
except NameError:
    pass  # Python 3
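# Grammar rules to skip. Each entry is a rule ID (with an optional bracketed
# sub-rule index) as it appears at the end of the first line of a match's
# string form from grammar_check; get_grammar_res() below drops such matches.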
GRAMMAR_CHECK_IGNORE = ['WHITESPACE_RULE', 'DOUBLE_PUNCTUATION', 'EN_QUOTES[1]',
                        'EN_QUOTES[2]', 'COMMA_PARENTHESIS_WHITESPACE',
                        'ENGLISH_WORD_REPEAT_RULE', 'EN_UNPAIRED_BRACKETS',
                        'ENGLISH_WORD_REPEAT_BEGINNING_RULE', 'CD_NN[1]',
                        'UPPERCASE_SENTENCE_START', 'ALL_OF_THE[1]', 'EN_QUOTES[3]',
                        'THREE_NN[1]', 'HE_VERB_AGR[7]', 'NUMEROUS_DIFFERENT[1]',
                        'LIFE_TIME[1]', 'PERIOD_OF_TIME[1]', 'WITH_OUT[1]', 'LARGE_NUMBER_OF[1]',
                        'MANY_NN_U[3]', 'COMP_THAN[3]', 'MASS_AGREEMENT[1]', 'MANY_NN[1]',
                        'GENERAL_XX[1]', 'EN_A_VS_AN']

def get_grammar_res(matches):
    """Filter the grammar check result with ignored check types.

    Parameters
    ----------
    matches: list
        Match result of grammar check

    Returns
    -------
    ret: list
        Filtered result
    """
    ret = []
    for match in matches:
        lines = str(match).split('\n')
        lines[0] = lines[0].rstrip()
        is_ignored = False
        for entry in GRAMMAR_CHECK_IGNORE:
            if lines[0].endswith(entry):
                is_ignored = True
                break
        if not is_ignored:
            ret.append(match)
    return ret

def check_doc(file_content, spell_checker, spell_check_ret):
    """Check the spelling of a piece of documentation text.

    Parameters
    ----------
    file_content: str
        Source text to be checked
    spell_checker: enchant.checker.SpellChecker
        Spell checker
    spell_check_ret: dict
        Spell check result dictionary mapping each typo to its occurrence count.
    """
    spell_checker.set_text(file_content)
    for error in spell_checker:
        if error.word in spell_check_ret:
            spell_check_ret[error.word] += 1
        else:
            spell_check_ret[error.word] = 1
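# Illustrative usage of check_doc (a sketch, not executed by this script;
# the sample text and variable names are hypothetical):
#   checker = SpellChecker(enchant.DictWithPWL('en_US', 'web-data/mxnet/doc/ignored_words.txt'))
#   counts = {}
#   check_doc("Thiss word is misspelled.", checker, counts)
#   # counts would then map each unrecognized word to its occurrence count.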

class DocParser(HTMLParser):
    """A document parser that parses an HTML file and conducts spelling
    and grammar checks.
    """
    def __init__(self):
        HTMLParser.__init__(self)
        self.__spell_check_res = {}
        self.__grammar_check_res = None
        self.__ignore_tag = False
        self.__is_code_block = False
        self.__in_code_block = False
        # The personal word list extends en_US with ignored words such as NDArray.
        self.__dictionary = enchant.DictWithPWL('en_US', 'web-data/mxnet/doc/ignored_words.txt')
        self.__spell_checker = SpellChecker(self.__dictionary)
        self.__parsed_content = ""
        self.__grammar_checker = grammar_check.LanguageTool('en-US')

    def handle_starttag(self, tag, attrs):
        self.__ignore_tag = tag.startswith('script') or tag.startswith('option')

    def handle_endtag(self, tag):
        pass

    def handle_data(self, data):
        # Ignore URL content
        if not self.__ignore_tag and not data.startswith('http'):
            check_doc(data, self.__spell_checker, self.__spell_check_res)

    def get_res(self):
        """Return the checking results.
        """
        return [self.__spell_check_res, self.__grammar_check_res]

    def clear_res(self):
        """Clear the checking results.
        """
        self.__spell_check_res = {}
        self.__grammar_check_res = None

    def check_grammar(self, file_name):
        """Check the grammar of the specified file.

        Parameters
        ----------
        file_name: str
            Name of the file to be checked
        """
        with open(file_name) as html_file:
            file_content = html2text.html2text(html_file.read())
        # Strip non-printable control characters before running the grammar check.
        file_content = re.sub(u"[\x00-\x08\x0b-\x0c\x0e-\x1f]+", u"", file_content)
        self.__grammar_check_res = self.__grammar_checker.check(file_content)
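# Driving the parser: feed() invokes handle_data() on each text node and
# accumulates spelling errors, while check_grammar() re-reads the file,
# converts it to plain text with html2text, and runs the grammar checker.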

if __name__ == "__main__":
    BUILD_HTML_DIR = '../../../docs/_build/html'
    CHINESE_HTML_DIR = '../../../docs/_build/html/zh'
    STATIC_HTML_DIR = '../../../docs/_build/html/_static'
    DOC_PARSER = DocParser()
    ALL_CLEAR = True
    for root, _, files in os.walk(BUILD_HTML_DIR):
        # Skip the Chinese translation and the static asset directories.
        if root.startswith(CHINESE_HTML_DIR) or root.startswith(STATIC_HTML_DIR):
            continue
        for read_file in files:
            if not read_file.endswith('.html') or read_file == 'README.html' or '_zh' in read_file:
                continue
            with open(os.path.join(root, read_file), 'r') as rd_file:
                content = rd_file.read()
            DOC_PARSER.clear_res()
            DOC_PARSER.feed(content)
            DOC_PARSER.check_grammar(os.path.join(root, read_file))
            spell_check_res = DOC_PARSER.get_res()[0]
            grammar_check_res = get_grammar_res(DOC_PARSER.get_res()[1])
            if len(spell_check_res) > 0:
                print(f"{os.path.join(root, read_file)} has typos:")
                print(f"{spell_check_res}\n")
                ALL_CLEAR = False
            if len(grammar_check_res) > 0:
                print(f"{os.path.join(root, read_file)} has grammar issues:")
                for match in grammar_check_res:
                    print(f"{match}\n")
                ALL_CLEAR = False
    if ALL_CLEAR:
        print("No typos or grammar issues found.")