blob: 26aadc5ea53ccff714320c6127ba72edd1d73a79 [file]
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
# This script serves to fetch GitHub issues into a json file
from __future__ import print_function
import os
import requests
import json
import re
import pandas as pd
import logging
class DataFetcher:
    """Fetch GitHub issues for one repository and store them as JSON.

    Credentials and the target repo default to the ``github_user``,
    ``github_oauth_token`` and ``repo`` environment variables.  Note the
    defaults are read once, at class-definition time, because they appear
    as argument defaults.
    """
    def __init__(self,
                 github_user=os.environ.get("github_user"),
                 github_oauth_token=os.environ.get("github_oauth_token"),
                 repo=os.environ.get("repo")):
        """
        This DataFetcher serves to fetch issues data
        Args:
            github_user(str): the github id. ie: "CathyZhang0822"
            github_oauth_token(str): the github oauth token, paired with
                github_user to realize authorization
            repo(str): the repo name, ie: "owner/repo"
        """
        self.github_user = github_user
        self.github_oauth_token = github_oauth_token
        self.repo = repo
        # (user, token) pair understood by requests' HTTP basic auth.
        self.auth = (self.github_user, self.github_oauth_token)
        # Cached payload of the most recent data2json() call.
        self.json_data = None

    def cleanstr(self, raw_string, sub_string):
        """
        Replace every non-alphanumeric character in raw_string with
        sub_string and lower-case the result.
        """
        clean = re.sub("[^0-9a-zA-Z]", sub_string, raw_string)
        return clean.lower()

    def count_pages(self, state):
        """
        Count how many pages of issues the repo has in total.
        state can be "open"/"closed"/"all"
        """
        url = 'https://api.github.com/repos/%s/issues' % self.repo
        response = requests.get(url, {'state': state},
                                auth=self.auth)
        assert response.status_code == 200, "Authorization failed"
        # GitHub omits the Link header when everything fits on one page.
        if "link" not in response.headers:
            return 1
        # The Link header's last entry is '...&page=<last>; rel="last"';
        # after squashing punctuation to spaces, the last page number is
        # the third-to-last whitespace token.
        return int(self.cleanstr(response.headers['link'], " ").split()[-3])

    def fetch_issues(self, issue_nums):
        """
        Fetch the given issues one request at a time.
        issue_nums: a non-empty list of issue ids
        Returns the issues' data as a pandas DataFrame with columns
        'id', 'title' and 'body'.
        """
        assert issue_nums != [], "Empty Input!"
        logging.info("Reading issues: %s", ", ".join(str(num) for num in issue_nums))
        data = []
        for number in issue_nums:
            url = 'https://api.github.com/repos/{}/issues/{}'.format(self.repo, number)
            response = requests.get(url, auth=self.auth)
            item = response.json()
            # A payload without 'title' means the API returned an error
            # document (e.g. "Not Found") instead of an issue.
            assert 'title' in item, "Issue {} doesn't exist!".format(number)
            data.append({'id': str(number), 'title': item['title'], 'body': item['body']})
        return pd.DataFrame(data)

    def data2json(self, state, labels=None, other_labels=False):
        """
        Store issues' data into a json file and return the json file's name.
        state can be either "open"/"closed"/"all"
        labels is a list of target label names we are interested in; when
        given, only issues carrying at least one of them are kept.
        other_labels: when True, record every label of a matching issue;
        when False, record only the (first) matching target label.
        """
        assert state in {'all', 'open', 'closed'}, "Invalid State!"
        logging.info("Reading %s issues..", state)
        pages = self.count_pages(state)
        data = []
        for page in range(1, pages + 1):
            # NOTE: the original appended '.format(repo=self.repo)' to the
            # literal '&per_page=30' — a no-op; removed.
            url = 'https://api.github.com/repos/{}/issues?page={}&per_page=30'.format(
                self.repo, page)
            response = requests.get(url,
                                    {'state': state,
                                     'base': 'master',
                                     'sort': 'created'},
                                    auth=self.auth)
            for item in response.json():
                # The issues endpoint also returns pull requests; skip them.
                if "pull_request" in item:
                    continue
                if "labels" not in item:
                    continue
                issue_labels = list({lb['name'] for lb in item['labels']})
                if labels is None:
                    # No filter: fetch all issues with all their labels.
                    data.append({'id': item['number'], 'title': item['title'],
                                 'body': item['body'], 'labels': issue_labels})
                    continue
                # Keep the issue if it has at least one target label.
                for label in labels:
                    if label in issue_labels:
                        if other_labels:
                            # Besides target labels, we still want other labels.
                            data.append({'id': item['number'], 'title': item['title'],
                                         'body': item['body'], 'labels': issue_labels})
                        else:
                            # Only record the target label; these three label
                            # names are unified under "Feature".
                            if label in {"Feature", "Call for Contribution", "Feature request"}:
                                label = "Feature"
                            data.append({'id': item['number'], 'title': item['title'],
                                         'body': item['body'], 'labels': label})
                        # With this break we only pick up the first target label.
                        break
        self.json_data = data
        s_labels = "_".join(labels) if labels is not None else "all_labels"
        # Fixed: the original format string "{}_data.json_{}" placed the label
        # suffix AFTER the .json extension (e.g. "open_data.json_Bug").
        filename = "{}_data_{}.json".format(state, s_labels)
        logging.info("Writing json file..")
        with open(filename, 'w') as write_file:
            json.dump(data, write_file)
        logging.info("%s json file is ready!", filename)
        return filename