# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#pylint: disable=no-member, too-many-instance-attributes
"""This script uses pyenchant to check spelling for MXNet
documentation website.
An exclude list is provided to avoid checking specific word,
such as NDArray.
"""
from __future__ import print_function
import os
import sys
import re
try:
    from HTMLParser import HTMLParser  # Python 2
except ImportError:
    from html.parser import HTMLParser  # Python 3
import enchant
from enchant.checker import SpellChecker
import grammar_check
import html2text
try:
    reload(sys)  # Python 2
    sys.setdefaultencoding('utf-8')
except NameError:
    pass  # Python 3
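# Grammar rules to skip. Each entry is a rule ID (with an optional bracketed
# sub-rule index) as it appears at the end of the first line of a match's
# string form from grammar_check; get_grammar_res() below drops such matches.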
GRAMMAR_CHECK_IGNORE = ['WHITESPACE_RULE', 'DOUBLE_PUNCTUATION', 'EN_QUOTES[1]',
                        'EN_QUOTES[2]', 'COMMA_PARENTHESIS_WHITESPACE',
                        'ENGLISH_WORD_REPEAT_RULE', 'EN_UNPAIRED_BRACKETS',
                        'ENGLISH_WORD_REPEAT_BEGINNING_RULE', 'CD_NN[1]',
                        'UPPERCASE_SENTENCE_START', 'ALL_OF_THE[1]', 'EN_QUOTES[3]',
                        'THREE_NN[1]', 'HE_VERB_AGR[7]', 'NUMEROUS_DIFFERENT[1]',
                        'LIFE_TIME[1]', 'PERIOD_OF_TIME[1]', 'WITH_OUT[1]', 'LARGE_NUMBER_OF[1]',
                        'MANY_NN_U[3]', 'COMP_THAN[3]', 'MASS_AGREEMENT[1]', 'MANY_NN[1]',
                        'GENERAL_XX[1]', 'EN_A_VS_AN']

def get_grammar_res(matches):
    """Filter the grammar check result with ignored check types.

    Parameters
    ----------
    matches: list
        Match result of grammar check

    Returns
    -------
    ret: list
        Filtered result
    """
    ret = []
    for match in matches:
        lines = str(match).split('\n')
        lines[0] = lines[0].rstrip()
        is_ignored = False
        for entry in GRAMMAR_CHECK_IGNORE:
            if lines[0].endswith(entry):
                is_ignored = True
                break
        if not is_ignored:
            ret.append(match)
    return ret

def check_doc(file_content, spell_checker, spell_check_ret):
    """Check the spelling of a piece of documentation text.

    Parameters
    ----------
    file_content: str
        Source text to be checked
    spell_checker: enchant.checker.SpellChecker
        Spell checker
    spell_check_ret: dict
        Spell check result dictionary mapping each typo to its occurrence count.
    """
    spell_checker.set_text(file_content)
    for error in spell_checker:
        if error.word in spell_check_ret:
            spell_check_ret[error.word] += 1
        else:
            spell_check_ret[error.word] = 1
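# Illustrative usage of check_doc (a sketch, not executed by this script;
# the sample text and variable names are hypothetical):
#   checker = SpellChecker(enchant.DictWithPWL('en_US', 'web-data/mxnet/doc/ignored_words.txt'))
#   counts = {}
#   check_doc("Thiss word is misspelled.", checker, counts)
#   # counts would then map each unrecognized word to its occurrence count.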

class DocParser(HTMLParser):
    """A document parser that parses an HTML file and conducts spelling
    and grammar checks.
    """
    def __init__(self):
        HTMLParser.__init__(self)
        self.__spell_check_res = {}
        self.__grammar_check_res = None
        self.__ignore_tag = False
        self.__is_code_block = False
        self.__in_code_block = False
        # The personal word list extends en_US with ignored words such as NDArray.
        self.__dictionary = enchant.DictWithPWL('en_US', 'web-data/mxnet/doc/ignored_words.txt')
        self.__spell_checker = SpellChecker(self.__dictionary)
        self.__parsed_content = ""
        self.__grammar_checker = grammar_check.LanguageTool('en-US')

    def handle_starttag(self, tag, attrs):
        self.__ignore_tag = tag.startswith('script') or tag.startswith('option')

    def handle_endtag(self, tag):
        pass

    def handle_data(self, data):
        # Ignore URL content
        if not self.__ignore_tag and not data.startswith('http'):
            check_doc(data, self.__spell_checker, self.__spell_check_res)

    def get_res(self):
        """Return the checking results.
        """
        return [self.__spell_check_res, self.__grammar_check_res]

    def clear_res(self):
        """Clear the checking results.
        """
        self.__spell_check_res = {}
        self.__grammar_check_res = None

    def check_grammar(self, file_name):
        """Check the grammar of the specified file.

        Parameters
        ----------
        file_name: str
            Name of the file to be checked
        """
        with open(file_name) as html_file:
            file_content = html2text.html2text(html_file.read())
        # Strip non-printable control characters before running the grammar check.
        file_content = re.sub(u"[\x00-\x08\x0b-\x0c\x0e-\x1f]+", u"", file_content)
        self.__grammar_check_res = self.__grammar_checker.check(file_content)
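# Driving the parser: feed() invokes handle_data() on each text node and
# accumulates spelling errors, while check_grammar() re-reads the file,
# converts it to plain text with html2text, and runs the grammar checker.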

if __name__ == "__main__":
    BUILD_HTML_DIR = '../../../docs/_build/html'
    CHINESE_HTML_DIR = '../../../docs/_build/html/zh'
    STATIC_HTML_DIR = '../../../docs/_build/html/_static'
    DOC_PARSER = DocParser()
    ALL_CLEAR = True
    for root, _, files in os.walk(BUILD_HTML_DIR):
        # Skip the Chinese translation and the static asset directories.
        if root.startswith(CHINESE_HTML_DIR) or root.startswith(STATIC_HTML_DIR):
            continue
        for read_file in files:
            if not read_file.endswith('.html') or read_file == 'README.html' or '_zh' in read_file:
                continue
            with open(os.path.join(root, read_file), 'r') as rd_file:
                content = rd_file.read()
            DOC_PARSER.clear_res()
            DOC_PARSER.feed(content)
            DOC_PARSER.check_grammar(os.path.join(root, read_file))
            spell_check_res = DOC_PARSER.get_res()[0]
            grammar_check_res = get_grammar_res(DOC_PARSER.get_res()[1])
            if len(spell_check_res) > 0:
                print(f"{os.path.join(root, read_file)} has typos:")
                print(f"{spell_check_res}\n")
                ALL_CLEAR = False
            if len(grammar_check_res) > 0:
                print(f"{os.path.join(root, read_file)} has grammar issues:")
                for match in grammar_check_res:
                    print(f"{match}\n")
                ALL_CLEAR = False
    if ALL_CLEAR:
        print("No typos or grammar issues found.")