blob: f78d87b221451202b686b4d8ac9800d0097cc629 [file] [log] [blame]
#!/usr/bin/env python
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""""Build a classifier using a subset of the DBpedia categories"""
from __future__ import print_function
from bz2 import BZ2File
from time import time
import urllib2
from urllib import quote
DBPEDIA_URL_PREFIX = "http://dbpedia.org/resource/"
def load_topics_from_tsv(filename, server_url):
    """Push the taxonomy described in a TSV file to the topic model server.

    Each line of ``filename`` must hold exactly three tab-separated fields:
    the concept name, a whitespace-separated list of broader concept names
    (or the PostgreSQL NULL marker) and the primary topic name. Names are
    turned into URIs by prepending ``DBPEDIA_URL_PREFIX``.

    One POST request is issued per line against ``server_url`` with the
    ``id``, ``primary_topic`` and (repeated) ``broader`` query parameters.
    A progress line is printed every 1000 concepts.

    Raises ``ValueError`` if a line does not have exactly three fields;
    errors raised by ``urllib2`` while talking to the server propagate.
    """
    # Release the file handle as soon as its content has been read.
    with open(filename, 'rb') as tsv_file:
        lines = tsv_file.readlines()

    # The opener holds no per-request state: build it once for the whole run.
    opener = urllib2.build_opener()
    count = 0
    previous = time()
    for line in lines:
        concept, broader_concepts, primary_topic = line.split('\t')
        primary_topic = DBPEDIA_URL_PREFIX + primary_topic.strip()
        concept = DBPEDIA_URL_PREFIX + concept.strip()
        if broader_concepts == '\\N':
            # postgresql marker for NULL values in TSV files
            broader_concepts = []
        else:
            broader_concepts = [DBPEDIA_URL_PREFIX + b.strip()
                                for b in broader_concepts.split()]

        # Percent-encode every parameter value the same way, so that names
        # containing '&', '#', '?' or spaces cannot corrupt the query string.
        url = server_url + "?id=%s&primary_topic=%s" % (
            quote(concept), quote(primary_topic))
        for broader_concept in broader_concepts:
            url += "&broader=%s" % quote(broader_concept)

        # force POST verb with data keyword
        request = urllib2.Request(url, data="")
        response = opener.open(request)
        try:
            response.read()
        finally:
            response.close()

        count += 1
        if count % 1000 == 0:
            delta, previous = time() - previous, time()
            print("Imported concepts %03d/%03d in %06.3fs"
                  % (count, len(lines), delta))
def load_examples_from_tsv(filename, server_url):
    """Upload the training examples listed in a TSV file to the server.

    ``filename`` may be a plain TSV file or a bzip2-compressed one (detected
    by the ``.bz2`` extension). Each line must hold exactly three
    tab-separated fields: the example identifier, a whitespace-separated
    list of category names and the text of the example. Identifier and
    category names are turned into URIs by prepending ``DBPEDIA_URL_PREFIX``.

    One POST request is issued per line against ``server_url``: the
    ``example_id`` and (repeated) ``concept`` values go in the query string
    and the text is sent as a ``text/plain`` request body. A progress line
    is printed every 1000 examples.

    Raises ``ValueError`` if a line does not have exactly three fields;
    errors raised by ``urllib2`` while talking to the server propagate.
    """
    if filename.endswith('.bz2'):
        tsv_file = BZ2File(filename)
    else:
        tsv_file = open(filename, 'rb')
    # try/finally rather than ``with`` so the handle is closed even on
    # interpreters where BZ2File is not a context manager.
    try:
        lines = tsv_file.readlines()
    finally:
        tsv_file.close()

    # The opener holds no per-request state: build it once for the whole run.
    opener = urllib2.build_opener()
    count = 0
    previous = time()
    for line in lines:
        example_id, categories, text = line.split('\t')
        example_id = DBPEDIA_URL_PREFIX + example_id
        categories = [DBPEDIA_URL_PREFIX + c for c in categories.split()]

        # Percent-encode the identifier like the concepts, so that names
        # containing '&', '#', '?' or spaces cannot corrupt the query string.
        url = server_url + "?example_id=%s" % quote(example_id)
        for category in categories:
            url += "&concept=%s" % quote(category)

        # Passing a body makes urllib2 issue a POST request.
        request = urllib2.Request(url, data=text)
        request.add_header('Content-Type', 'text/plain')
        response = opener.open(request)
        try:
            response.read()
        finally:
            response.close()

        count += 1
        if count % 1000 == 0:
            delta, previous = time() - previous, time()
            print("Processed articles %03d/%03d in %06.3fs"
                  % (count, len(lines), delta))
if __name__ == "__main__":
    # Command line entry point: load the taxonomy, load the training set,
    # then ask the server to (re)train its model.
    import sys

    # Fail with a readable message instead of an IndexError traceback when
    # arguments are missing; extra arguments are still ignored.
    if len(sys.argv) < 4:
        sys.exit("Usage: %s <topics.tsv> <examples.tsv[.bz2]> "
                 "<topic_model_url>" % sys.argv[0])

    topics_filename = sys.argv[1]
    examples_filename = sys.argv[2]
    topic_model_url = sys.argv[3]

    print("Loading taxonomy definition from:", topics_filename)
    t0 = time()
    load_topics_from_tsv(topics_filename,
                         topic_model_url + '/concept')
    print("Taxonomy loaded in %0.3fs." % (time() - t0))

    print("Loading training set from:", examples_filename)
    t0 = time()
    load_examples_from_tsv(examples_filename,
                           topic_model_url + '/trainingset')
    print("Dataset loaded in %0.3fs." % (time() - t0))

    print("Training model from dataset...")
    t0 = time()
    # Force usage of the POST HTTP verb by providing a (empty) body.
    request = urllib2.Request(topic_model_url + '/trainer', data="")
    # The response body is read to wait for completion, then discarded.
    urllib2.build_opener().open(request).read()
    print("Model updated in %0.3fs." % (time() - t0))