blob: 26aadc5ea53ccff714320c6127ba72edd1d73a79 [file]
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
# This script serves to fetch GitHub issues into a json file
from __future__ import print_function
import os
import requests
import json
import re
import pandas as pd
import logging
class DataFetcher:
    """Fetch GitHub issues for one repository and store them as JSON.

    Credentials and the target repo default to the ``github_user``,
    ``github_oauth_token`` and ``repo`` environment variables.  Note the
    defaults are read once, at class-definition time, because they appear
    as argument defaults.
    """
    def __init__(self,
                 github_user=os.environ.get("github_user"),
                 github_oauth_token=os.environ.get("github_oauth_token"),
                 repo=os.environ.get("repo")):
        """
        This DataFetcher serves to fetch issues data
        Args:
            github_user(str): the github id. ie: "CathyZhang0822"
            github_oauth_token(str): the github oauth token, paired with
                github_user to realize authorization
            repo(str): the repo name, ie: "owner/repo"
        """
        self.github_user = github_user
        self.github_oauth_token = github_oauth_token
        self.repo = repo
        # (user, token) pair understood by requests' HTTP basic auth.
        self.auth = (self.github_user, self.github_oauth_token)
        # Cached payload of the most recent data2json() call.
        self.json_data = None

    def cleanstr(self, raw_string, sub_string):
        """
        Replace every non-alphanumeric character in raw_string with
        sub_string and lower-case the result.
        """
        clean = re.sub("[^0-9a-zA-Z]", sub_string, raw_string)
        return clean.lower()

    def count_pages(self, state):
        """
        Count how many pages of issues the repo has in total.
        state can be "open"/"closed"/"all"
        """
        url = 'https://api.github.com/repos/%s/issues' % self.repo
        response = requests.get(url, {'state': state},
                                auth=self.auth)
        assert response.status_code == 200, "Authorization failed"
        # GitHub omits the Link header when everything fits on one page.
        if "link" not in response.headers:
            return 1
        # The Link header's last entry is '...&page=<last>; rel="last"';
        # after squashing punctuation to spaces, the last page number is
        # the third-to-last whitespace token.
        return int(self.cleanstr(response.headers['link'], " ").split()[-3])

    def fetch_issues(self, issue_nums):
        """
        Fetch the given issues one request at a time.
        issue_nums: a non-empty list of issue ids
        Returns the issues' data as a pandas DataFrame with columns
        'id', 'title' and 'body'.
        """
        assert issue_nums != [], "Empty Input!"
        logging.info("Reading issues: %s", ", ".join(str(num) for num in issue_nums))
        data = []
        for number in issue_nums:
            url = 'https://api.github.com/repos/{}/issues/{}'.format(self.repo, number)
            response = requests.get(url, auth=self.auth)
            item = response.json()
            # A payload without 'title' means the API returned an error
            # document (e.g. "Not Found") instead of an issue.
            assert 'title' in item, "Issue {} doesn't exist!".format(number)
            data.append({'id': str(number), 'title': item['title'], 'body': item['body']})
        return pd.DataFrame(data)

    def data2json(self, state, labels=None, other_labels=False):
        """
        Store issues' data into a json file and return the json file's name.
        state can be either "open"/"closed"/"all"
        labels is a list of target label names we are interested in; when
        given, only issues carrying at least one of them are kept.
        other_labels: when True, record every label of a matching issue;
        when False, record only the (first) matching target label.
        """
        assert state in {'all', 'open', 'closed'}, "Invalid State!"
        logging.info("Reading %s issues..", state)
        pages = self.count_pages(state)
        data = []
        for page in range(1, pages + 1):
            # NOTE: the original appended '.format(repo=self.repo)' to the
            # literal '&per_page=30' — a no-op; removed.
            url = 'https://api.github.com/repos/{}/issues?page={}&per_page=30'.format(
                self.repo, page)
            response = requests.get(url,
                                    {'state': state,
                                     'base': 'master',
                                     'sort': 'created'},
                                    auth=self.auth)
            for item in response.json():
                # The issues endpoint also returns pull requests; skip them.
                if "pull_request" in item:
                    continue
                if "labels" not in item:
                    continue
                issue_labels = list({lb['name'] for lb in item['labels']})
                if labels is None:
                    # No filter: fetch all issues with all their labels.
                    data.append({'id': item['number'], 'title': item['title'],
                                 'body': item['body'], 'labels': issue_labels})
                    continue
                # Keep the issue if it has at least one target label.
                for label in labels:
                    if label in issue_labels:
                        if other_labels:
                            # Besides target labels, we still want other labels.
                            data.append({'id': item['number'], 'title': item['title'],
                                         'body': item['body'], 'labels': issue_labels})
                        else:
                            # Only record the target label; these three label
                            # names are unified under "Feature".
                            if label in {"Feature", "Call for Contribution", "Feature request"}:
                                label = "Feature"
                            data.append({'id': item['number'], 'title': item['title'],
                                         'body': item['body'], 'labels': label})
                        # With this break we only pick up the first target label.
                        break
        self.json_data = data
        s_labels = "_".join(labels) if labels is not None else "all_labels"
        # Fixed: the original format string "{}_data.json_{}" placed the label
        # suffix AFTER the .json extension (e.g. "open_data.json_Bug").
        filename = "{}_data_{}.json".format(state, s_labels)
        logging.info("Writing json file..")
        with open(filename, 'w') as write_file:
            json.dump(data, write_file)
        logging.info("%s json file is ready!", filename)
        return filename