blob: 067e8f2a3a4e92ae4e247bd8078c6b6b8db21683 [file] [log] [blame] [view]
<!--
Licensed to the Apache Software Foundation (ASF) under one
or more contributor license agreements. See the NOTICE file
distributed with this work for additional information
regarding copyright ownership. The ASF licenses this file
to you under the Apache License, Version 2.0 (the
"License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing,
software distributed under the License is distributed on an
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
KIND, either express or implied. See the License for the
specific language governing permissions and limitations
under the License.
-->
# Preparation
```
-- Switch to the webspam database; the statements that follow use unqualified table names.
use webspam;
-- Remove any previously registered copy of the jar before adding it again.
delete jar ./tmp/hivemall.jar;
add jar ./tmp/hivemall.jar;
-- Run the definitions script. Presumably it registers the Hivemall functions
-- (train_pa1, train_arow, train_scw, ...) called later -- TODO confirm.
source ./tmp/define-all.hive;
```
# PA1
```sql
-- Rebuild the PA1 model table from scratch. `if exists` keeps this script
-- re-runnable even when the table has not been created yet, independent of
-- the hive.exec.drop.ignorenonexistent setting.
drop table if exists webspam_pa1_model1;

-- train_pa1 emits (feature, weight) rows from the training table; the emitted
-- weights are then combined per feature with voted_avg and stored as float.
create table webspam_pa1_model1 as
select
  feature,
  cast(voted_avg(weight) as float) as weight
from (
  select
    train_pa1(features, label) as (feature, weight) -- sparse model
    -- dense-model alternative:
    -- train_pa1(features, label, "-dense -dims 33554432") as (feature, weight)
  from
    webspam_train_x3
) trained
group by
  feature;
-- Score every test row against the PA1 model: one output row per rowid.
-- The left outer join keeps test features that are missing from the model;
-- their product is NULL and is skipped by sum(). A row with no matching
-- feature at all gets a NULL total_weight and falls through to label -1.
create or replace view webspam_pa1_predict1 as
select
  ex.rowid,
  sum(mdl.weight * ex.value) as total_weight,
  case
    when sum(mdl.weight * ex.value) > 0.0 then 1
    else -1
  end as label
from
  webspam_test_exploded ex
  left outer join webspam_pa1_model1 mdl
    on (ex.feature = mdl.feature)
group by
  ex.rowid;
-- Pair each test row's own label (actual) with the label produced by the
-- PA1 prediction view (predicted), matched on rowid.
create or replace view webspam_pa1_submit1 as
select
  tst.rowid,
  tst.label as actual,
  pred.label as predicted
from
  webspam_test tst
  join webspam_pa1_predict1 pred
    on (tst.rowid = pred.rowid);
-- Accuracy of the PA1 model: number of correctly predicted rows divided by a
-- hard-coded 70000.
-- NOTE(review): 70000 is presumably the row count of webspam_test -- confirm.
select
  count(1) / 70000
from
  webspam_pa1_submit1
where
  actual = predicted;
```
> Prediction accuracy: 0.9628428571428571
# AROW
```sql
-- Rebuild the AROW model table from scratch. `if exists` keeps this script
-- re-runnable even when the table has not been created yet, independent of
-- the hive.exec.drop.ignorenonexistent setting.
drop table if exists webspam_arow_model1;

-- train_arow emits (feature, weight, covar) rows from the training table; the
-- (weight, covar) pairs are then merged per feature with argmin_kld.
create table webspam_arow_model1 as
select
  feature,
  argmin_kld(weight, covar) as weight
from (
  select
    train_arow(features, label) as (feature, weight, covar)
  from
    webspam_train_x3
) trained
group by
  feature;
-- Score every test row against the AROW model: one output row per rowid.
-- The left outer join keeps test features that are missing from the model;
-- their product is NULL and is skipped by sum(). A row with no matching
-- feature at all gets a NULL total_weight and falls through to label -1.
create or replace view webspam_arow_predict1 as
select
  ex.rowid,
  sum(mdl.weight * ex.value) as total_weight,
  case
    when sum(mdl.weight * ex.value) > 0.0 then 1
    else -1
  end as label
from
  webspam_test_exploded ex
  left outer join webspam_arow_model1 mdl
    on (ex.feature = mdl.feature)
group by
  ex.rowid;
-- Pair each test row's own label (actual) with the label produced by the
-- AROW prediction view (predicted), matched on rowid.
create or replace view webspam_arow_submit1 as
select
  tst.rowid,
  tst.label as actual,
  pred.label as predicted
from
  webspam_test tst
  join webspam_arow_predict1 pred
    on (tst.rowid = pred.rowid);
-- Accuracy of the AROW model: number of correctly predicted rows divided by a
-- hard-coded 70000.
-- NOTE(review): 70000 is presumably the row count of webspam_test -- confirm.
select
  count(1) / 70000
from
  webspam_arow_submit1
where
  actual = predicted;
```
> Prediction accuracy: 0.9747428571428571
# SCW1
```sql
-- Rebuild the SCW model table from scratch. `if exists` keeps this script
-- re-runnable even when the table has not been created yet, independent of
-- the hive.exec.drop.ignorenonexistent setting.
drop table if exists webspam_scw_model1;

-- train_scw emits (feature, weight, covar) rows from the training table; the
-- (weight, covar) pairs are then merged per feature with argmin_kld.
create table webspam_scw_model1 as
select
  feature,
  argmin_kld(weight, covar) as weight
from (
  select
    train_scw(features, label) as (feature, weight, covar)
  from
    webspam_train_x3
) trained
group by
  feature;
-- Score every test row against the SCW model: one output row per rowid.
-- The left outer join keeps test features that are missing from the model;
-- their product is NULL and is skipped by sum(). A row with no matching
-- feature at all gets a NULL total_weight and falls through to label -1.
create or replace view webspam_scw_predict1 as
select
  ex.rowid,
  sum(mdl.weight * ex.value) as total_weight,
  case
    when sum(mdl.weight * ex.value) > 0.0 then 1
    else -1
  end as label
from
  webspam_test_exploded ex
  left outer join webspam_scw_model1 mdl
    on (ex.feature = mdl.feature)
group by
  ex.rowid;
-- Pair each test row's own label (actual) with the label produced by the
-- SCW prediction view (predicted), matched on rowid.
create or replace view webspam_scw_submit1 as
select
  tst.rowid,
  tst.label as actual,
  pred.label as predicted
from
  webspam_test tst
  join webspam_scw_predict1 pred
    on (tst.rowid = pred.rowid);
-- Accuracy of the SCW model: number of correctly predicted rows divided by a
-- hard-coded 70000.
-- NOTE(review): 70000 is presumably the row count of webspam_test -- confirm.
select
  count(1) / 70000
from
  webspam_scw_submit1
where
  actual = predicted;
```
> Prediction accuracy: 0.9778714285714286