Restructure and redesign the preparator and algorithm.
diff --git a/README.md b/README.md
index 75efa05..95a10b8 100644
--- a/README.md
+++ b/README.md
@@ -4,6 +4,11 @@
# Release Information
+## Version 4.0
+
+Restructured and redesigned the preparator and algorithm for lower memory usage and faster run time.
+Moved the BIDMach, VW & SPPMI algorithm changes to the `bidmach` branch temporarily.
+
## Version 3.1
Fix DataSource to read "content", "e-mail", and use label "spam" for tutorial data.
diff --git a/build.sbt b/build.sbt
index 31188ae..62021b9 100644
--- a/build.sbt
+++ b/build.sbt
@@ -1,22 +1,12 @@
-
name := "org.template.textclassification"
organization := "io.prediction"
scalaVersion := "2.10.5"
-libraryDependencies ++= Seq(
- "io.prediction" % "core_2.10" % pioVersion.value % "provided",
- "org.apache.spark" %% "spark-core" % "1.4.1" % "provided",
- "org.apache.spark" %% "spark-mllib" % "1.4.1" % "provided",
- "com.github.fommil.netlib" % "all" % "1.1.2" pomOnly(),
- "com.github.johnlangford" % "vw-jni" % "8.0.0",
- "org.xerial.snappy" % "snappy-java" % "1.1.1.7"
-)
+organization := "io.prediction"
-mergeStrategy in assembly <<= (mergeStrategy in assembly) { (old) =>
- {
- case y if y.startsWith("doc") => MergeStrategy.discard
- case x => old(x)
- }
-}
+libraryDependencies ++= Seq(
+ "io.prediction" %% "core" % pioVersion.value % "provided",
+ "org.apache.spark" %% "spark-core" % "1.4.1" % "provided",
+ "org.apache.spark" %% "spark-mllib" % "1.4.1" % "provided")
diff --git a/data/.DS_Store b/data/.DS_Store
deleted file mode 100644
index 5008ddf..0000000
--- a/data/.DS_Store
+++ /dev/null
Binary files differ
diff --git a/getnativepath.java b/getnativepath.java
deleted file mode 100644
index 9a7f2c0..0000000
--- a/getnativepath.java
+++ /dev/null
@@ -1,7 +0,0 @@
-public class getnativepath {
- public static void main(String [] args)
- {
- String v = System.getProperty("java.library.path");
- System.out.print(v);
- }
-}
\ No newline at end of file
diff --git a/lib/Apache_Commons_Math_LICENSE.txt b/lib/Apache_Commons_Math_LICENSE.txt
deleted file mode 100755
index 0333373..0000000
--- a/lib/Apache_Commons_Math_LICENSE.txt
+++ /dev/null
@@ -1,387 +0,0 @@
-
- Apache License
- Version 2.0, January 2004
- http://www.apache.org/licenses/
-
- TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
-
- 1. Definitions.
-
- "License" shall mean the terms and conditions for use, reproduction,
- and distribution as defined by Sections 1 through 9 of this document.
-
- "Licensor" shall mean the copyright owner or entity authorized by
- the copyright owner that is granting the License.
-
- "Legal Entity" shall mean the union of the acting entity and all
- other entities that control, are controlled by, or are under common
- control with that entity. For the purposes of this definition,
- "control" means (i) the power, direct or indirect, to cause the
- direction or management of such entity, whether by contract or
- otherwise, or (ii) ownership of fifty percent (50%) or more of the
- outstanding shares, or (iii) beneficial ownership of such entity.
-
- "You" (or "Your") shall mean an individual or Legal Entity
- exercising permissions granted by this License.
-
- "Source" form shall mean the preferred form for making modifications,
- including but not limited to software source code, documentation
- source, and configuration files.
-
- "Object" form shall mean any form resulting from mechanical
- transformation or translation of a Source form, including but
- not limited to compiled object code, generated documentation,
- and conversions to other media types.
-
- "Work" shall mean the work of authorship, whether in Source or
- Object form, made available under the License, as indicated by a
- copyright notice that is included in or attached to the work
- (an example is provided in the Appendix below).
-
- "Derivative Works" shall mean any work, whether in Source or Object
- form, that is based on (or derived from) the Work and for which the
- editorial revisions, annotations, elaborations, or other modifications
- represent, as a whole, an original work of authorship. For the purposes
- of this License, Derivative Works shall not include works that remain
- separable from, or merely link (or bind by name) to the interfaces of,
- the Work and Derivative Works thereof.
-
- "Contribution" shall mean any work of authorship, including
- the original version of the Work and any modifications or additions
- to that Work or Derivative Works thereof, that is intentionally
- submitted to Licensor for inclusion in the Work by the copyright owner
- or by an individual or Legal Entity authorized to submit on behalf of
- the copyright owner. For the purposes of this definition, "submitted"
- means any form of electronic, verbal, or written communication sent
- to the Licensor or its representatives, including but not limited to
- communication on electronic mailing lists, source code control systems,
- and issue tracking systems that are managed by, or on behalf of, the
- Licensor for the purpose of discussing and improving the Work, but
- excluding communication that is conspicuously marked or otherwise
- designated in writing by the copyright owner as "Not a Contribution."
-
- "Contributor" shall mean Licensor and any individual or Legal Entity
- on behalf of whom a Contribution has been received by Licensor and
- subsequently incorporated within the Work.
-
- 2. Grant of Copyright License. Subject to the terms and conditions of
- this License, each Contributor hereby grants to You a perpetual,
- worldwide, non-exclusive, no-charge, royalty-free, irrevocable
- copyright license to reproduce, prepare Derivative Works of,
- publicly display, publicly perform, sublicense, and distribute the
- Work and such Derivative Works in Source or Object form.
-
- 3. Grant of Patent License. Subject to the terms and conditions of
- this License, each Contributor hereby grants to You a perpetual,
- worldwide, non-exclusive, no-charge, royalty-free, irrevocable
- (except as stated in this section) patent license to make, have made,
- use, offer to sell, sell, import, and otherwise transfer the Work,
- where such license applies only to those patent claims licensable
- by such Contributor that are necessarily infringed by their
- Contribution(s) alone or by combination of their Contribution(s)
- with the Work to which such Contribution(s) was submitted. If You
- institute patent litigation against any entity (including a
- cross-claim or counterclaim in a lawsuit) alleging that the Work
- or a Contribution incorporated within the Work constitutes direct
- or contributory patent infringement, then any patent licenses
- granted to You under this License for that Work shall terminate
- as of the date such litigation is filed.
-
- 4. Redistribution. You may reproduce and distribute copies of the
- Work or Derivative Works thereof in any medium, with or without
- modifications, and in Source or Object form, provided that You
- meet the following conditions:
-
- (a) You must give any other recipients of the Work or
- Derivative Works a copy of this License; and
-
- (b) You must cause any modified files to carry prominent notices
- stating that You changed the files; and
-
- (c) You must retain, in the Source form of any Derivative Works
- that You distribute, all copyright, patent, trademark, and
- attribution notices from the Source form of the Work,
- excluding those notices that do not pertain to any part of
- the Derivative Works; and
-
- (d) If the Work includes a "NOTICE" text file as part of its
- distribution, then any Derivative Works that You distribute must
- include a readable copy of the attribution notices contained
- within such NOTICE file, excluding those notices that do not
- pertain to any part of the Derivative Works, in at least one
- of the following places: within a NOTICE text file distributed
- as part of the Derivative Works; within the Source form or
- documentation, if provided along with the Derivative Works; or,
- within a display generated by the Derivative Works, if and
- wherever such third-party notices normally appear. The contents
- of the NOTICE file are for informational purposes only and
- do not modify the License. You may add Your own attribution
- notices within Derivative Works that You distribute, alongside
- or as an addendum to the NOTICE text from the Work, provided
- that such additional attribution notices cannot be construed
- as modifying the License.
-
- You may add Your own copyright statement to Your modifications and
- may provide additional or different license terms and conditions
- for use, reproduction, or distribution of Your modifications, or
- for any such Derivative Works as a whole, provided Your use,
- reproduction, and distribution of the Work otherwise complies with
- the conditions stated in this License.
-
- 5. Submission of Contributions. Unless You explicitly state otherwise,
- any Contribution intentionally submitted for inclusion in the Work
- by You to the Licensor shall be under the terms and conditions of
- this License, without any additional terms or conditions.
- Notwithstanding the above, nothing herein shall supersede or modify
- the terms of any separate license agreement you may have executed
- with Licensor regarding such Contributions.
-
- 6. Trademarks. This License does not grant permission to use the trade
- names, trademarks, service marks, or product names of the Licensor,
- except as required for reasonable and customary use in describing the
- origin of the Work and reproducing the content of the NOTICE file.
-
- 7. Disclaimer of Warranty. Unless required by applicable law or
- agreed to in writing, Licensor provides the Work (and each
- Contributor provides its Contributions) on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
- implied, including, without limitation, any warranties or conditions
- of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
- PARTICULAR PURPOSE. You are solely responsible for determining the
- appropriateness of using or redistributing the Work and assume any
- risks associated with Your exercise of permissions under this License.
-
- 8. Limitation of Liability. In no event and under no legal theory,
- whether in tort (including negligence), contract, or otherwise,
- unless required by applicable law (such as deliberate and grossly
- negligent acts) or agreed to in writing, shall any Contributor be
- liable to You for damages, including any direct, indirect, special,
- incidental, or consequential damages of any character arising as a
- result of this License or out of the use or inability to use the
- Work (including but not limited to damages for loss of goodwill,
- work stoppage, computer failure or malfunction, or any and all
- other commercial damages or losses), even if such Contributor
- has been advised of the possibility of such damages.
-
- 9. Accepting Warranty or Additional Liability. While redistributing
- the Work or Derivative Works thereof, You may choose to offer,
- and charge a fee for, acceptance of support, warranty, indemnity,
- or other liability obligations and/or rights consistent with this
- License. However, in accepting such obligations, You may act only
- on Your own behalf and on Your sole responsibility, not on behalf
- of any other Contributor, and only if You agree to indemnify,
- defend, and hold each Contributor harmless for any liability
- incurred by, or claims asserted against, such Contributor by reason
- of your accepting any such warranty or additional liability.
-
- END OF TERMS AND CONDITIONS
-
- APPENDIX: How to apply the Apache License to your work.
-
- To apply the Apache License to your work, attach the following
- boilerplate notice, with the fields enclosed by brackets "[]"
- replaced with your own identifying information. (Don't include
- the brackets!) The text should be enclosed in the appropriate
- comment syntax for the file format. We also recommend that a
- file or class name and description of purpose be included on the
- same "printed page" as the copyright notice for easier
- identification within third-party archives.
-
- Copyright [yyyy] [name of copyright owner]
-
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-
-
-APACHE COMMONS MATH DERIVATIVE WORKS:
-
-The Apache commons-math library includes a number of subcomponents
-whose implementation is derived from original sources written
-in C or Fortran. License terms of the original sources
-are reproduced below.
-
-===============================================================================
-For the lmder, lmpar and qrsolv Fortran routine from minpack and translated in
-the LevenbergMarquardtOptimizer class in package
-org.apache.commons.math3.optimization.general
-Original source copyright and license statement:
-
-Minpack Copyright Notice (1999) University of Chicago. All rights reserved
-
-Redistribution and use in source and binary forms, with or
-without modification, are permitted provided that the
-following conditions are met:
-
-1. Redistributions of source code must retain the above
-copyright notice, this list of conditions and the following
-disclaimer.
-
-2. Redistributions in binary form must reproduce the above
-copyright notice, this list of conditions and the following
-disclaimer in the documentation and/or other materials
-provided with the distribution.
-
-3. The end-user documentation included with the
-redistribution, if any, must include the following
-acknowledgment:
-
- "This product includes software developed by the
- University of Chicago, as Operator of Argonne National
- Laboratory.
-
-Alternately, this acknowledgment may appear in the software
-itself, if and wherever such third-party acknowledgments
-normally appear.
-
-4. WARRANTY DISCLAIMER. THE SOFTWARE IS SUPPLIED "AS IS"
-WITHOUT WARRANTY OF ANY KIND. THE COPYRIGHT HOLDER, THE
-UNITED STATES, THE UNITED STATES DEPARTMENT OF ENERGY, AND
-THEIR EMPLOYEES: (1) DISCLAIM ANY WARRANTIES, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO ANY IMPLIED WARRANTIES
-OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, TITLE
-OR NON-INFRINGEMENT, (2) DO NOT ASSUME ANY LEGAL LIABILITY
-OR RESPONSIBILITY FOR THE ACCURACY, COMPLETENESS, OR
-USEFULNESS OF THE SOFTWARE, (3) DO NOT REPRESENT THAT USE OF
-THE SOFTWARE WOULD NOT INFRINGE PRIVATELY OWNED RIGHTS, (4)
-DO NOT WARRANT THAT THE SOFTWARE WILL FUNCTION
-UNINTERRUPTED, THAT IT IS ERROR-FREE OR THAT ANY ERRORS WILL
-BE CORRECTED.
-
-5. LIMITATION OF LIABILITY. IN NO EVENT WILL THE COPYRIGHT
-HOLDER, THE UNITED STATES, THE UNITED STATES DEPARTMENT OF
-ENERGY, OR THEIR EMPLOYEES: BE LIABLE FOR ANY INDIRECT,
-INCIDENTAL, CONSEQUENTIAL, SPECIAL OR PUNITIVE DAMAGES OF
-ANY KIND OR NATURE, INCLUDING BUT NOT LIMITED TO LOSS OF
-PROFITS OR LOSS OF DATA, FOR ANY REASON WHATSOEVER, WHETHER
-SUCH LIABILITY IS ASSERTED ON THE BASIS OF CONTRACT, TORT
-(INCLUDING NEGLIGENCE OR STRICT LIABILITY), OR OTHERWISE,
-EVEN IF ANY OF SAID PARTIES HAS BEEN WARNED OF THE
-POSSIBILITY OF SUCH LOSS OR DAMAGES.
-===============================================================================
-
-Copyright and license statement for the odex Fortran routine developed by
-E. Hairer and G. Wanner and translated in GraggBulirschStoerIntegrator class
-in package org.apache.commons.math3.ode.nonstiff:
-
-
-Copyright (c) 2004, Ernst Hairer
-
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are
-met:
-
-- Redistributions of source code must retain the above copyright
-notice, this list of conditions and the following disclaimer.
-
-- Redistributions in binary form must reproduce the above copyright
-notice, this list of conditions and the following disclaimer in the
-documentation and/or other materials provided with the distribution.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
-IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
-TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
-PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR
-CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-===============================================================================
-
-Copyright and license statement for the original lapack fortran routines
-translated in EigenDecompositionImpl class in package
-org.apache.commons.math3.linear:
-
-Copyright (c) 1992-2008 The University of Tennessee. All rights reserved.
-
-$COPYRIGHT$
-
-Additional copyrights may follow
-
-$HEADER$
-
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are
-met:
-
-- Redistributions of source code must retain the above copyright
- notice, this list of conditions and the following disclaimer.
-
-- Redistributions in binary form must reproduce the above copyright
- notice, this list of conditions and the following disclaimer listed
- in this license in the documentation and/or other materials
- provided with the distribution.
-
-- Neither the name of the copyright holders nor the names of its
- contributors may be used to endorse or promote products derived from
- this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-===============================================================================
-
-Copyright and license statement for the original Mersenne twister C
-routines translated in MersenneTwister class in package
-org.apache.commons.math3.random:
-
- Copyright (C) 1997 - 2002, Makoto Matsumoto and Takuji Nishimura,
- All rights reserved.
-
- Redistribution and use in source and binary forms, with or without
- modification, are permitted provided that the following conditions
- are met:
-
- 1. Redistributions of source code must retain the above copyright
- notice, this list of conditions and the following disclaimer.
-
- 2. Redistributions in binary form must reproduce the above copyright
- notice, this list of conditions and the following disclaimer in the
- documentation and/or other materials provided with the distribution.
-
- 3. The names of its contributors may not be used to endorse or promote
- products derived from this software without specific prior written
- permission.
-
- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
- CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
- EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
- PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
- PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
- LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
- NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-===============================================================================
-
-The class "org.apache.commons.math3.exception.util.LocalizedFormatsTest" is
-an adapted version of "OrekitMessagesTest" test class for the Orekit library
-The "org.apache.commons.math3.analysis.interpolation.HermiteInterpolator"
-has been imported from the Orekit space flight dynamics library.
-
-Th Orekit library is described at:
- https://www.orekit.org/forge/projects/orekit
-The original files are distributed under the terms of the Apache 2 license
-which is: Copyright 2010 CS Communication & Systèmes
diff --git a/lib/Apache_License.txt b/lib/Apache_License.txt
deleted file mode 100644
index d645695..0000000
--- a/lib/Apache_License.txt
+++ /dev/null
@@ -1,202 +0,0 @@
-
- Apache License
- Version 2.0, January 2004
- http://www.apache.org/licenses/
-
- TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
-
- 1. Definitions.
-
- "License" shall mean the terms and conditions for use, reproduction,
- and distribution as defined by Sections 1 through 9 of this document.
-
- "Licensor" shall mean the copyright owner or entity authorized by
- the copyright owner that is granting the License.
-
- "Legal Entity" shall mean the union of the acting entity and all
- other entities that control, are controlled by, or are under common
- control with that entity. For the purposes of this definition,
- "control" means (i) the power, direct or indirect, to cause the
- direction or management of such entity, whether by contract or
- otherwise, or (ii) ownership of fifty percent (50%) or more of the
- outstanding shares, or (iii) beneficial ownership of such entity.
-
- "You" (or "Your") shall mean an individual or Legal Entity
- exercising permissions granted by this License.
-
- "Source" form shall mean the preferred form for making modifications,
- including but not limited to software source code, documentation
- source, and configuration files.
-
- "Object" form shall mean any form resulting from mechanical
- transformation or translation of a Source form, including but
- not limited to compiled object code, generated documentation,
- and conversions to other media types.
-
- "Work" shall mean the work of authorship, whether in Source or
- Object form, made available under the License, as indicated by a
- copyright notice that is included in or attached to the work
- (an example is provided in the Appendix below).
-
- "Derivative Works" shall mean any work, whether in Source or Object
- form, that is based on (or derived from) the Work and for which the
- editorial revisions, annotations, elaborations, or other modifications
- represent, as a whole, an original work of authorship. For the purposes
- of this License, Derivative Works shall not include works that remain
- separable from, or merely link (or bind by name) to the interfaces of,
- the Work and Derivative Works thereof.
-
- "Contribution" shall mean any work of authorship, including
- the original version of the Work and any modifications or additions
- to that Work or Derivative Works thereof, that is intentionally
- submitted to Licensor for inclusion in the Work by the copyright owner
- or by an individual or Legal Entity authorized to submit on behalf of
- the copyright owner. For the purposes of this definition, "submitted"
- means any form of electronic, verbal, or written communication sent
- to the Licensor or its representatives, including but not limited to
- communication on electronic mailing lists, source code control systems,
- and issue tracking systems that are managed by, or on behalf of, the
- Licensor for the purpose of discussing and improving the Work, but
- excluding communication that is conspicuously marked or otherwise
- designated in writing by the copyright owner as "Not a Contribution."
-
- "Contributor" shall mean Licensor and any individual or Legal Entity
- on behalf of whom a Contribution has been received by Licensor and
- subsequently incorporated within the Work.
-
- 2. Grant of Copyright License. Subject to the terms and conditions of
- this License, each Contributor hereby grants to You a perpetual,
- worldwide, non-exclusive, no-charge, royalty-free, irrevocable
- copyright license to reproduce, prepare Derivative Works of,
- publicly display, publicly perform, sublicense, and distribute the
- Work and such Derivative Works in Source or Object form.
-
- 3. Grant of Patent License. Subject to the terms and conditions of
- this License, each Contributor hereby grants to You a perpetual,
- worldwide, non-exclusive, no-charge, royalty-free, irrevocable
- (except as stated in this section) patent license to make, have made,
- use, offer to sell, sell, import, and otherwise transfer the Work,
- where such license applies only to those patent claims licensable
- by such Contributor that are necessarily infringed by their
- Contribution(s) alone or by combination of their Contribution(s)
- with the Work to which such Contribution(s) was submitted. If You
- institute patent litigation against any entity (including a
- cross-claim or counterclaim in a lawsuit) alleging that the Work
- or a Contribution incorporated within the Work constitutes direct
- or contributory patent infringement, then any patent licenses
- granted to You under this License for that Work shall terminate
- as of the date such litigation is filed.
-
- 4. Redistribution. You may reproduce and distribute copies of the
- Work or Derivative Works thereof in any medium, with or without
- modifications, and in Source or Object form, provided that You
- meet the following conditions:
-
- (a) You must give any other recipients of the Work or
- Derivative Works a copy of this License; and
-
- (b) You must cause any modified files to carry prominent notices
- stating that You changed the files; and
-
- (c) You must retain, in the Source form of any Derivative Works
- that You distribute, all copyright, patent, trademark, and
- attribution notices from the Source form of the Work,
- excluding those notices that do not pertain to any part of
- the Derivative Works; and
-
- (d) If the Work includes a "NOTICE" text file as part of its
- distribution, then any Derivative Works that You distribute must
- include a readable copy of the attribution notices contained
- within such NOTICE file, excluding those notices that do not
- pertain to any part of the Derivative Works, in at least one
- of the following places: within a NOTICE text file distributed
- as part of the Derivative Works; within the Source form or
- documentation, if provided along with the Derivative Works; or,
- within a display generated by the Derivative Works, if and
- wherever such third-party notices normally appear. The contents
- of the NOTICE file are for informational purposes only and
- do not modify the License. You may add Your own attribution
- notices within Derivative Works that You distribute, alongside
- or as an addendum to the NOTICE text from the Work, provided
- that such additional attribution notices cannot be construed
- as modifying the License.
-
- You may add Your own copyright statement to Your modifications and
- may provide additional or different license terms and conditions
- for use, reproduction, or distribution of Your modifications, or
- for any such Derivative Works as a whole, provided Your use,
- reproduction, and distribution of the Work otherwise complies with
- the conditions stated in this License.
-
- 5. Submission of Contributions. Unless You explicitly state otherwise,
- any Contribution intentionally submitted for inclusion in the Work
- by You to the Licensor shall be under the terms and conditions of
- this License, without any additional terms or conditions.
- Notwithstanding the above, nothing herein shall supersede or modify
- the terms of any separate license agreement you may have executed
- with Licensor regarding such Contributions.
-
- 6. Trademarks. This License does not grant permission to use the trade
- names, trademarks, service marks, or product names of the Licensor,
- except as required for reasonable and customary use in describing the
- origin of the Work and reproducing the content of the NOTICE file.
-
- 7. Disclaimer of Warranty. Unless required by applicable law or
- agreed to in writing, Licensor provides the Work (and each
- Contributor provides its Contributions) on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
- implied, including, without limitation, any warranties or conditions
- of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
- PARTICULAR PURPOSE. You are solely responsible for determining the
- appropriateness of using or redistributing the Work and assume any
- risks associated with Your exercise of permissions under this License.
-
- 8. Limitation of Liability. In no event and under no legal theory,
- whether in tort (including negligence), contract, or otherwise,
- unless required by applicable law (such as deliberate and grossly
- negligent acts) or agreed to in writing, shall any Contributor be
- liable to You for damages, including any direct, indirect, special,
- incidental, or consequential damages of any character arising as a
- result of this License or out of the use or inability to use the
- Work (including but not limited to damages for loss of goodwill,
- work stoppage, computer failure or malfunction, or any and all
- other commercial damages or losses), even if such Contributor
- has been advised of the possibility of such damages.
-
- 9. Accepting Warranty or Additional Liability. While redistributing
- the Work or Derivative Works thereof, You may choose to offer,
- and charge a fee for, acceptance of support, warranty, indemnity,
- or other liability obligations and/or rights consistent with this
- License. However, in accepting such obligations, You may act only
- on Your own behalf and on Your sole responsibility, not on behalf
- of any other Contributor, and only if You agree to indemnify,
- defend, and hold each Contributor harmless for any liability
- incurred by, or claims asserted against, such Contributor by reason
- of your accepting any such warranty or additional liability.
-
- END OF TERMS AND CONDITIONS
-
- APPENDIX: How to apply the Apache License to your work.
-
- To apply the Apache License to your work, attach the following
- boilerplate notice, with the fields enclosed by brackets "[]"
- replaced with your own identifying information. (Don't include
- the brackets!) The text should be enclosed in the appropriate
- comment syntax for the file format. We also recommend that a
- file or class name and description of purpose be included on the
- same "printed page" as the copyright notice for easier
- identification within third-party archives.
-
- Copyright [yyyy] [name of copyright owner]
-
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
diff --git a/lib/BIDMach.jar b/lib/BIDMach.jar
deleted file mode 100644
index 26cc39e..0000000
--- a/lib/BIDMach.jar
+++ /dev/null
Binary files differ
diff --git a/lib/BIDMat.jar b/lib/BIDMat.jar
deleted file mode 100755
index 8d46f5c..0000000
--- a/lib/BIDMat.jar
+++ /dev/null
Binary files differ
diff --git a/lib/HDF5_Copyright.html b/lib/HDF5_Copyright.html
deleted file mode 100644
index 07a71f4..0000000
--- a/lib/HDF5_Copyright.html
+++ /dev/null
@@ -1,160 +0,0 @@
-<html><head>
-<meta http-equiv="content-type" content="text/html; charset=ISO-8859-1">
- <title>
- HDF5 Copyright Notice and License Terms
- </title>
- </head>
-
-<body bgcolor="#FFFFFF">
-<!-- NEW PAGE -->
-
-
-<hr>
-
-<h3>Copyright Notice and License Terms for
-<br>
-HDF5 (Hierarchical Data Format 5) Software Library and Utilities</h3>
-<hr>
-<p>
-
-
-HDF5 (Hierarchical Data Format 5) Software Library and Utilities
-<br>
-Copyright 2006-2012 by The HDF Group.
-</p><p>
-NCSA HDF5 (Hierarchical Data Format 5) Software Library and Utilities
-<br>
-Copyright 1998-2006 by the Board of Trustees of the University of Illinois.
-</p><p>
-<strong>All rights reserved.</strong>
-</p><p>
-
-</p><p>
-Redistribution and use in source and binary forms, with or without
-modification, are permitted for any purpose (including commercial purposes)
-provided that the following conditions are met:
-
-</p><p>
-</p><ol>
-<li>
-Redistributions of source code must retain the above copyright notice,
-this list of conditions, and the following disclaimer.
-
-</li><li>
-Redistributions in binary form must reproduce the above copyright notice,
-this list of conditions, and the following disclaimer in the documentation
-and/or materials provided with the distribution.
-
-</li><li>
-In addition, redistributions of modified forms of the source or binary code
-must carry prominent notices stating that the original code was changed and
-the date of the change.
-
-</li><li>
-All publications or advertising materials mentioning features or use of this
-software are asked, but not required, to acknowledge that it was developed
-by The HDF Group and by the National Center for Supercomputing Applications
-at the University of Illinois at Urbana-Champaign and credit the contributors.
-
-</li><li>
-Neither the name of The HDF Group, the name of the University, nor the name
-of any Contributor may be used to endorse or promote products derived from
-this software without specific prior written permission from The HDF Group,
-the University, or the Contributor, respectively.
-</li></ol>
-
-<p>
-<b>DISCLAIMER:</b>
-THIS SOFTWARE IS PROVIDED BY THE HDF GROUP AND THE CONTRIBUTORS
-"AS IS" WITH NO WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED.
-In no event shall The HDF Group or the Contributors be liable for any damages
-suffered by the users arising out of the use of this software, even if advised
-of the possibility of such damage.
-
-
-</p><hr>
-<hr>
-
-<p>
-Contributors: National Center for Supercomputing Applications (NCSA) at
-the University of Illinois, Fortner Software, Unidata Program Center (netCDF),
-The Independent JPEG Group (JPEG), Jean-loup Gailly and Mark Adler (gzip),
-and Digital Equipment Corporation (DEC).
-
-</p><hr>
-
-<p>
-Portions of HDF5 were developed with support from the Lawrence Berkeley
-National Laboratory (LBNL) and the United States Department of Energy
-under Prime Contract No. DE-AC02-05CH11231.
-
-</p><hr>
-
-<p>
-Portions of HDF5 were developed with support from the University of
-California, Lawrence Livermore National Laboratory (UC LLNL).
-The following statement applies to those portions of the product and must
-be retained in any redistribution of source code, binaries, documentation,
-and/or accompanying materials:
-</p><dir>
- This work was partially produced at the University of California,
- Lawrence Livermore National Laboratory (UC LLNL) under contract
- no. W-7405-ENG-48 (Contract 48) between the U.S. Department of
- Energy (DOE) and The Regents of the University of California
- (University) for the operation of UC LLNL.
- <p>
- <b>DISCLAIMER:</b>
- This work was prepared as an account of work sponsored by an agency
- of the United States Government. Neither the United States Government
- nor the University of California nor any of their employees, makes
- any warranty, express or implied, or assumes any liability or
- responsibility for the accuracy, completeness, or usefulness of any
- information, apparatus, product, or process disclosed, or represents
- that its use would not infringe privately- owned rights. Reference
- herein to any specific commercial products, process, or service by
- trade name, trademark, manufacturer, or otherwise, does not
- necessarily constitute or imply its endorsement, recommendation, or
- favoring by the United States Government or the University of
- California. The views and opinions of authors expressed herein do not
- necessarily state or reflect those of the United States Government or
- the University of California, and shall not be used for advertising
- or product endorsement purposes.
-</p></dir>
-
-<hr>
-
-<!-- DO NOT EDIT THE FOLLOWING 8 LINES; -->
-<!-- THEY ARE AUTOMATICALLY UPDATED BY DOCUMENTATION SOFTWARE. -->
-
-<!-- #BeginLibraryItem "/ed_libs/Footer.lbi" -->
-<address>
-<table border="0" width="100%">
- <tbody><tr valign="top">
- <td align="left">
- <address>
- The HDF Group Help Desk: <img src="HDF5_help.png" align="top" height="16">
- <br>
- Describes HDF5 Release 1.8.9, May 2012.
- </address>
- </td><td width="5%"> </td>
- <td align="right">
- <a href="http://www.hdfgroup.org/HDF5/doc/Copyright.html">Copyright</a> by
- <a href="http://www.hdfgroup.org/">The HDF Group</a>
- <br>
- and the Board of Trustees of the University of Illinois
- </td>
- </tr>
-</tbody></table>
-</address>
-<!-- #EndLibraryItem --><script language="JAVASCRIPT">
-<!--
-document.writeln("Last modified: 5 March 2012")
--->
-</script>Last modified: 5 March 2012
-
-
-
-
-
-
-</body></html>
diff --git a/lib/IScala_license.txt b/lib/IScala_license.txt
deleted file mode 100755
index 0f0a0d0..0000000
--- a/lib/IScala_license.txt
+++ /dev/null
@@ -1,20 +0,0 @@
-Copyright (c) 2013-2014 by Mateusz Paprocki and contributors.
-
-Permission is hereby granted, free of charge, to any person obtaining
-a copy of this software and associated documentation files (the
-"Software"), to deal in the Software without restriction, including
-without limitation the rights to use, copy, modify, merge, publish,
-distribute, sublicense, and/or sell copies of the Software, and to
-permit persons to whom the Software is furnished to do so, subject to
-the following conditions:
-
-The above copyright notice and this permission notice shall be
-included in all copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
-MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
-NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
-LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
-OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
-WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
diff --git a/lib/JCUDA_Copyright.txt b/lib/JCUDA_Copyright.txt
deleted file mode 100644
index a47ba68..0000000
--- a/lib/JCUDA_Copyright.txt
+++ /dev/null
@@ -1,24 +0,0 @@
-JCuda - Java bindings for NVIDIA CUDA
-
-Copyright (c) 2008-2012 Marco Hutter - http://www.jcuda.org
-
-Permission is hereby granted, free of charge, to any person
-obtaining a copy of this software and associated documentation
-files (the "Software"), to deal in the Software without
-restriction, including without limitation the rights to use,
-copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the
-Software is furnished to do so, subject to the following
-conditions:
-
-The above copyright notice and this permission notice shall be
-included in all copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
-OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
-NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
-HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
-WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
-OTHER DEALINGS IN THE SOFTWARE.
diff --git a/lib/License.txt b/lib/License.txt
deleted file mode 100644
index 5faa1fb..0000000
--- a/lib/License.txt
+++ /dev/null
@@ -1,26 +0,0 @@
-
-JCuda - Java bindings for NVIDIA CUDA
-
-Copyright (c) 2008-2015 Marco Hutter - http://www.jcuda.org
-
-Permission is hereby granted, free of charge, to any person
-obtaining a copy of this software and associated documentation
-files (the "Software"), to deal in the Software without
-restriction, including without limitation the rights to use,
-copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the
-Software is furnished to do so, subject to the following
-conditions:
-
-The above copyright notice and this permission notice shall be
-included in all copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
-OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
-NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
-HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
-WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
-OTHER DEALINGS IN THE SOFTWARE.
-
diff --git a/lib/PtPlot_Copyright.txt b/lib/PtPlot_Copyright.txt
deleted file mode 100755
index 7da2f50..0000000
--- a/lib/PtPlot_Copyright.txt
+++ /dev/null
@@ -1,27 +0,0 @@
-Below is the copyright agreement for the Ptolemy II system.
-Version: $Id: copyright.txt 57469 2010-03-10 22:04:46Z cxh $
-
-Copyright (c) 1995-2010 The Regents of the University of California.
-All rights reserved.
-
-Permission is hereby granted, without written agreement and without
-license or royalty fees, to use, copy, modify, and distribute this
-software and its documentation for any purpose, provided that the above
-copyright notice and the following two paragraphs appear in all copies
-of this software.
-
-IN NO EVENT SHALL THE UNIVERSITY OF CALIFORNIA BE LIABLE TO ANY PARTY
-FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES
-ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION, EVEN IF
-THE UNIVERSITY OF CALIFORNIA HAS BEEN ADVISED OF THE POSSIBILITY OF
-SUCH DAMAGE.
-
-THE UNIVERSITY OF CALIFORNIA SPECIFICALLY DISCLAIMS ANY WARRANTIES,
-INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
-MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE SOFTWARE
-PROVIDED HEREUNDER IS ON AN "AS IS" BASIS, AND THE UNIVERSITY OF
-CALIFORNIA HAS NO OBLIGATION TO PROVIDE MAINTENANCE, SUPPORT, UPDATES,
-ENHANCEMENTS, OR MODIFICATIONS.
-
-Ptolemy II includes the work of others, to see those copyrights, follow
-the copyright link on the splash page or see copyright.htm.
diff --git a/lib/Scala_License.txt b/lib/Scala_License.txt
deleted file mode 100755
index ba8cd07..0000000
--- a/lib/Scala_License.txt
+++ /dev/null
@@ -1,35 +0,0 @@
-SCALA LICENSE
-
-Copyright (c) 2002-2013 EPFL, Lausanne, unless otherwise specified.
-All rights reserved.
-
-This software was developed by the Programming Methods Laboratory of the
-Swiss Federal Institute of Technology (EPFL), Lausanne, Switzerland.
-
-Permission to use, copy, modify, and distribute this software in source
-or binary form for any purpose with or without fee is hereby granted,
-provided that the following conditions are met:
-
- 1. Redistributions of source code must retain the above copyright
- notice, this list of conditions and the following disclaimer.
-
- 2. Redistributions in binary form must reproduce the above copyright
- notice, this list of conditions and the following disclaimer in the
- documentation and/or other materials provided with the distribution.
-
- 3. Neither the name of the EPFL nor the names of its contributors
- may be used to endorse or promote products derived from this
- software without specific prior written permission.
-
-
-THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
-ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
-FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
-LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
-OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
-SUCH DAMAGE.
\ No newline at end of file
diff --git a/lib/bidmach_init.scala b/lib/bidmach_init.scala
deleted file mode 100755
index ea021f7..0000000
--- a/lib/bidmach_init.scala
+++ /dev/null
@@ -1,16 +0,0 @@
-import BIDMat.{CMat,CSMat,DMat,Dict,FMat,FND,GMat,GDMat,GIMat,GLMat,GSMat,GSDMat,HMat,IDict,Image,IMat,LMat,Mat,SMat,SBMat,SDMat}
-import BIDMat.MatFunctions._
-import BIDMat.SciFunctions._
-import BIDMat.Solvers._
-import BIDMat.Plotting._
-import BIDMach.Learner
-import BIDMach.models.{FM,GLM,KMeans,KMeansw,LDA,LDAgibbs,Model,NMF,SFA,RandomForest}
-import BIDMach.networks.{DNN}
-import BIDMach.datasources.{DataSource,MatDS,FilesDS,SFilesDS}
-import BIDMach.mixins.{CosineSim,Perplexity,Top,L1Regularizer,L2Regularizer}
-import BIDMach.updaters.{ADAGrad,Batch,BatchNorm,IncMult,IncNorm,Telescoping}
-import BIDMach.causal.{IPTW}
-
-Mat.checkMKL
-Mat.checkCUDA
-
diff --git a/lib/commons-math3-3.2.jar b/lib/commons-math3-3.2.jar
deleted file mode 100644
index f8b7db2..0000000
--- a/lib/commons-math3-3.2.jar
+++ /dev/null
Binary files differ
diff --git a/lib/jcublas-0.7.0a.jar b/lib/jcublas-0.7.0a.jar
deleted file mode 100644
index 9613fb0..0000000
--- a/lib/jcublas-0.7.0a.jar
+++ /dev/null
Binary files differ
diff --git a/lib/jcuda-0.7.0a.jar b/lib/jcuda-0.7.0a.jar
deleted file mode 100644
index 5ce4c19..0000000
--- a/lib/jcuda-0.7.0a.jar
+++ /dev/null
Binary files differ
diff --git a/lib/jcufft-0.7.0a.jar b/lib/jcufft-0.7.0a.jar
deleted file mode 100644
index 72827d4..0000000
--- a/lib/jcufft-0.7.0a.jar
+++ /dev/null
Binary files differ
diff --git a/lib/jcurand-0.7.0a.jar b/lib/jcurand-0.7.0a.jar
deleted file mode 100644
index f51034a..0000000
--- a/lib/jcurand-0.7.0a.jar
+++ /dev/null
Binary files differ
diff --git a/lib/jcusolver-0.7.0a.jar b/lib/jcusolver-0.7.0a.jar
deleted file mode 100644
index 090f0b4..0000000
--- a/lib/jcusolver-0.7.0a.jar
+++ /dev/null
Binary files differ
diff --git a/lib/jcusparse-0.7.0a.jar b/lib/jcusparse-0.7.0a.jar
deleted file mode 100644
index 8fbd9f9..0000000
--- a/lib/jcusparse-0.7.0a.jar
+++ /dev/null
Binary files differ
diff --git a/lib/jhdf5.jar b/lib/jhdf5.jar
deleted file mode 100644
index 9d15b7d..0000000
--- a/lib/jhdf5.jar
+++ /dev/null
Binary files differ
diff --git a/lib/jline-2.11.jar b/lib/jline-2.11.jar
deleted file mode 100755
index 9604bd2..0000000
--- a/lib/jline-2.11.jar
+++ /dev/null
Binary files differ
diff --git a/lib/junit-4.11.jar b/lib/junit-4.11.jar
deleted file mode 100755
index aaf7444..0000000
--- a/lib/junit-4.11.jar
+++ /dev/null
Binary files differ
diff --git a/lib/libJCublas-linux-x86_64.so b/lib/libJCublas-linux-x86_64.so
deleted file mode 100644
index eb729eb..0000000
--- a/lib/libJCublas-linux-x86_64.so
+++ /dev/null
Binary files differ
diff --git a/lib/libJCublas2-linux-x86_64.so b/lib/libJCublas2-linux-x86_64.so
deleted file mode 100644
index d15af99..0000000
--- a/lib/libJCublas2-linux-x86_64.so
+++ /dev/null
Binary files differ
diff --git a/lib/libJCudaDriver-linux-x86_64.so b/lib/libJCudaDriver-linux-x86_64.so
deleted file mode 100644
index 23c5bec..0000000
--- a/lib/libJCudaDriver-linux-x86_64.so
+++ /dev/null
Binary files differ
diff --git a/lib/libJCudaRuntime-linux-x86_64.so b/lib/libJCudaRuntime-linux-x86_64.so
deleted file mode 100644
index 34d276f..0000000
--- a/lib/libJCudaRuntime-linux-x86_64.so
+++ /dev/null
Binary files differ
diff --git a/lib/libJCufft-linux-x86_64.so b/lib/libJCufft-linux-x86_64.so
deleted file mode 100644
index 387ef92..0000000
--- a/lib/libJCufft-linux-x86_64.so
+++ /dev/null
Binary files differ
diff --git a/lib/libJCurand-linux-x86_64.so b/lib/libJCurand-linux-x86_64.so
deleted file mode 100644
index 15079cc..0000000
--- a/lib/libJCurand-linux-x86_64.so
+++ /dev/null
Binary files differ
diff --git a/lib/libJCusolver-linux-x86_64.so b/lib/libJCusolver-linux-x86_64.so
deleted file mode 100644
index 0a15dc4..0000000
--- a/lib/libJCusolver-linux-x86_64.so
+++ /dev/null
Binary files differ
diff --git a/lib/libJCusparse-linux-x86_64.so b/lib/libJCusparse-linux-x86_64.so
deleted file mode 100644
index 71a47f8..0000000
--- a/lib/libJCusparse-linux-x86_64.so
+++ /dev/null
Binary files differ
diff --git a/lib/libbidmachcpu-linux-x86_64.so b/lib/libbidmachcpu-linux-x86_64.so
deleted file mode 100755
index fb86860..0000000
--- a/lib/libbidmachcpu-linux-x86_64.so
+++ /dev/null
Binary files differ
diff --git a/lib/libbidmachcuda-linux-x86_64.so b/lib/libbidmachcuda-linux-x86_64.so
deleted file mode 100755
index 68c31ba..0000000
--- a/lib/libbidmachcuda-linux-x86_64.so
+++ /dev/null
Binary files differ
diff --git a/lib/libbidmatcuda-linux-x86_64.so b/lib/libbidmatcuda-linux-x86_64.so
deleted file mode 100755
index 3ceacfd..0000000
--- a/lib/libbidmatcuda-linux-x86_64.so
+++ /dev/null
Binary files differ
diff --git a/lib/libbidmatmkl-linux-x86_64.so b/lib/libbidmatmkl-linux-x86_64.so
deleted file mode 100755
index af19147..0000000
--- a/lib/libbidmatmkl-linux-x86_64.so
+++ /dev/null
Binary files differ
diff --git a/lib/libcaffe-linux-x86_64.so b/lib/libcaffe-linux-x86_64.so
deleted file mode 100755
index 0aae3ec..0000000
--- a/lib/libcaffe-linux-x86_64.so
+++ /dev/null
Binary files differ
diff --git a/lib/libcaffe.so b/lib/libcaffe.so
deleted file mode 100755
index e6d83d8..0000000
--- a/lib/libcaffe.so
+++ /dev/null
Binary files differ
diff --git a/lib/libiomp5.so b/lib/libiomp5.so
deleted file mode 100755
index 3b9e725..0000000
--- a/lib/libiomp5.so
+++ /dev/null
Binary files differ
diff --git a/lib/libjhdf.so b/lib/libjhdf.so
deleted file mode 100755
index ff6304c..0000000
--- a/lib/libjhdf.so
+++ /dev/null
Binary files differ
diff --git a/lib/libjhdf5-linux-x86_64.so b/lib/libjhdf5-linux-x86_64.so
deleted file mode 100755
index c3dcb2d..0000000
--- a/lib/libjhdf5-linux-x86_64.so
+++ /dev/null
Binary files differ
diff --git a/lib/libjhdf5.so b/lib/libjhdf5.so
deleted file mode 100755
index c3dcb2d..0000000
--- a/lib/libjhdf5.so
+++ /dev/null
Binary files differ
diff --git a/lib/lz4-1.3.jar b/lib/lz4-1.3.jar
deleted file mode 100755
index ffa8eaf..0000000
--- a/lib/lz4-1.3.jar
+++ /dev/null
Binary files differ
diff --git a/lib/ptplot.jar b/lib/ptplot.jar
deleted file mode 100644
index 9582f1c..0000000
--- a/lib/ptplot.jar
+++ /dev/null
Binary files differ
diff --git a/lib/ptplotapplication.jar b/lib/ptplotapplication.jar
deleted file mode 100755
index cc32dd0..0000000
--- a/lib/ptplotapplication.jar
+++ /dev/null
Binary files differ
diff --git a/src/main/scala/org/template/textclassification/DataSource.scala b/src/main/scala/DataSource.scala
similarity index 65%
rename from src/main/scala/org/template/textclassification/DataSource.scala
rename to src/main/scala/DataSource.scala
index 6a29909..be81932 100644
--- a/src/main/scala/org/template/textclassification/DataSource.scala
+++ b/src/main/scala/DataSource.scala
@@ -1,40 +1,40 @@
package org.template.textclassification
-import grizzled.slf4j.Logger
-import io.prediction.controller.EmptyEvaluationInfo
-import io.prediction.controller.Params
import io.prediction.controller.PDataSource
+import io.prediction.controller.EmptyEvaluationInfo
+import io.prediction.controller.EmptyActualResult
+import io.prediction.controller.Params
import io.prediction.controller.SanityCheck
+import io.prediction.data.storage.Event
import io.prediction.data.store.PEventStore
+
import org.apache.spark.SparkContext
+import org.apache.spark.SparkContext._
import org.apache.spark.rdd.RDD
+import grizzled.slf4j.Logger
-
-// 1. Initialize your Data Source parameters. This is
-// where you specify your application name, MyTextApp,
-// and the number of folds that are to be used for
-// cross validation.
-
+/** Define Data Source parameters.
+ * appName is the application name.
+ * evalK is the number of folds that are to be used for cross validation (optional)
+ */
case class DataSourceParams(
- appName: String,
- evalK: Option[Int]
- ) extends Params
+ appName: String,
+ evalK: Option[Int]
+ ) extends Params
-
-// 2. Define your DataSource component. Remember, you must
-// implement a readTraining method, and, optionally, a
-// readEval method.
-
+/** Define your DataSource component. Remember, you must
+ * implement a readTraining method, and, optionally, a
+ * readEval method.
+ */
class DataSource (
- val dsp : DataSourceParams
- ) extends PDataSource[TrainingData, EmptyEvaluationInfo, Query, ActualResult] {
+ val dsp : DataSourceParams
+) extends PDataSource[TrainingData, EmptyEvaluationInfo, Query, ActualResult] {
@transient lazy val logger = Logger[this.type]
- // Helper function used to store data given
- // a SparkContext.
+ /** Helper function used to store data given a SparkContext. */
private def readEventData(sc: SparkContext) : RDD[Observation] = {
//Get RDD of Events.
PEventStore.find(
@@ -54,8 +54,7 @@
}).cache
}
- // Helper function used to store stop words from
- // event server.
+ /** Helper function used to store stop words from event server. */
private def readStopWords(sc : SparkContext) : Set[String] = {
PEventStore.find(
appName = dsp.appName,
@@ -69,16 +68,15 @@
.toSet
}
-
- // Read in data and stop words from event server
- // and store them in a TrainingData instance.
+ /** Read in data and stop words from event server
+ * and store them in a TrainingData instance.
+ */
override
def readTraining(sc: SparkContext): TrainingData = {
new TrainingData(readEventData(sc), readStopWords(sc))
}
- // Used for evaluation: reads in event data and creates
- // cross-validation folds.
+ /** Used for evaluation: reads in event data and creates cross-validation folds. */
override
def readEval(sc: SparkContext):
Seq[(TrainingData, EmptyEvaluationInfo, RDD[(Query, ActualResult)])] = {
@@ -102,27 +100,28 @@
(train, new EmptyEvaluationInfo, test)
}
}
+
}
-
-// 3. Observation class serving as a wrapper for both our
-// data's class label and document string.
+/** Observation class serving as a wrapper for both our
+ * data's class label and document string.
+ */
case class Observation(
- label : Double,
- text : String,
- category :String
- ) extends Serializable
+ label: Double,
+ text: String,
+ category: String
+) extends Serializable
-// 4. TrainingData class serving as a wrapper for all
-// read in from the Event Server.
+/** TrainingData class serving as a wrapper for all
+ * read in from the Event Server.
+ */
class TrainingData(
- val data : RDD[Observation],
- val stopWords : Set[String]
- ) extends Serializable with SanityCheck {
+ val data : RDD[Observation],
+ val stopWords : Set[String]
+) extends Serializable with SanityCheck {
- // Sanity check to make sure your data is being fed in correctly.
-
- def sanityCheck {
+ /** Sanity check to make sure your data is being fed in correctly. */
+ def sanityCheck(): Unit = {
try {
val obs : Array[Double] = data.takeSample(false, 5).map(_.label)
diff --git a/src/main/scala/Engine.scala b/src/main/scala/Engine.scala
new file mode 100644
index 0000000..5cd5420
--- /dev/null
+++ b/src/main/scala/Engine.scala
@@ -0,0 +1,36 @@
+package org.template.textclassification
+
+import io.prediction.controller.IEngineFactory
+import io.prediction.controller.Engine
+
+/** Define Query class which serves as a wrapper for
+ * new text data.
+ */
+case class Query(text: String) extends Serializable
+
+/** Define PredictedResult class which serves as a
+ * wrapper for a predicted class label and the associated
+ * prediction confidence.
+ */
+case class PredictedResult(
+ category: String,
+ confidence: Double) extends Serializable
+
+/** Define ActualResult class which serves as a wrapper
+ * for an observation's true class label.
+ */
+case class ActualResult(category: String) extends Serializable
+
+/** Define Engine */
+object TextClassificationEngine extends IEngineFactory {
+ def apply() = {
+ new Engine(
+ classOf[DataSource],
+ classOf[Preparator],
+ Map(
+ "nb" -> classOf[NBAlgorithm],
+ "lr" -> classOf[LRAlgorithm]
+ ),
+ classOf[Serving])
+ }
+}
diff --git a/src/main/scala/org/template/textclassification/Evaluation.scala b/src/main/scala/Evaluation.scala
similarity index 65%
rename from src/main/scala/org/template/textclassification/Evaluation.scala
rename to src/main/scala/Evaluation.scala
index d42f5e1..3f3a070 100644
--- a/src/main/scala/org/template/textclassification/Evaluation.scala
+++ b/src/main/scala/Evaluation.scala
@@ -1,14 +1,16 @@
package org.template.textclassification
-import io.prediction.controller._
+import io.prediction.controller.AverageMetric
+import io.prediction.controller.Evaluation
+import io.prediction.controller.EmptyEvaluationInfo
+import io.prediction.controller.EngineParamsGenerator
+import io.prediction.controller.EngineParams
-
-
-// 1. Create an accuracy metric for evaluating our supervised learning model.
+/** Create an accuracy metric for evaluating our supervised learning model. */
case class Accuracy()
extends AverageMetric[EmptyEvaluationInfo, Query, PredictedResult, ActualResult] {
- // Method for calculating prediction accuracy.
+ /** Method for calculating prediction accuracy. */
def calculate(
query: Query,
predicted: PredictedResult,
@@ -17,9 +19,9 @@
}
-
-// 2. Define your evaluation object implementing the accuracy metric defined
-// above.
+/** Define your evaluation object implementing the accuracy metric defined
+ * above.
+ */
object AccuracyEvaluation extends Evaluation {
// Define Engine and Metric used in Evaluation.
@@ -29,15 +31,13 @@
)
}
-
-
-// 3. Set your engine parameters for evaluation procedure.
+/** Set your engine parameters for evaluation procedure.*/
object EngineParamsList extends EngineParamsGenerator {
// Set data source and preparator parameters.
private[this] val baseEP = EngineParams(
dataSourceParams = DataSourceParams(appName = "MyTextApp", evalK = Some(3)),
- preparatorParams = PreparatorParams(nGram = 2, 5000, true)
+ preparatorParams = PreparatorParams(nGram = 2, numFeatures = 500)
)
// Set the algorithm params for which we will assess an accuracy score.
diff --git a/src/main/scala/LRAlgorithm.scala b/src/main/scala/LRAlgorithm.scala
new file mode 100644
index 0000000..1f0fcd3
--- /dev/null
+++ b/src/main/scala/LRAlgorithm.scala
@@ -0,0 +1,111 @@
+package org.template.textclassification
+
+import io.prediction.controller.P2LAlgorithm
+import io.prediction.controller.Params
+
+import org.apache.spark.SparkContext
+import org.apache.spark.SparkContext._
+import org.apache.spark.rdd.RDD
+import org.apache.spark.ml.classification.LogisticRegression
+import org.apache.spark.sql.DataFrame
+import org.apache.spark.sql.functions
+import org.apache.spark.sql.SQLContext
+import org.apache.spark.sql.UserDefinedFunction
+
+import grizzled.slf4j.Logger
+
+case class LRAlgorithmParams(regParam: Double) extends Params
+
+class LRAlgorithm(val ap: LRAlgorithmParams)
+ extends P2LAlgorithm[PreparedData, LRModel, Query, PredictedResult] {
+
+ @transient lazy val logger = Logger[this.type]
+
+ def train(sc: SparkContext, pd: PreparedData): LRModel = {
+
+ // Import SQLContext for creating DataFrame.
+ val sql: SQLContext = new SQLContext(sc)
+ import sql.implicits._
+
+ val lr = new LogisticRegression()
+ .setMaxIter(10)
+ .setThreshold(0.5)
+ .setRegParam(ap.regParam)
+
+ val labels: Seq[Double] = pd.categoryMap.keys.toSeq
+
+ val data = labels.foldLeft(pd.transformedData.toDF)( //transform to Spark DataFrame
+ // Add the different binary columns for each label.
+ (data: DataFrame, label: Double) => {
+ // function: multiclass labels --> binary labels
+ val f: UserDefinedFunction = functions.udf((e : Double) => if (e == label) 1.0 else 0.0)
+
+ data.withColumn(label.toInt.toString, f(data("label")))
+ }
+ )
+
+ // Create a logistic regression model for each class.
+ val lrModels : Seq[(Double, LREstimate)] = labels.map(
+ label => {
+ val lab = label.toInt.toString
+
+ val fit = lr.setLabelCol(lab).fit(
+ data.select(lab, "features")
+ )
+
+ // Return (label, feature coefficients, and intercept term).
+ (label, LREstimate(fit.weights.toArray, fit.intercept))
+
+ }
+ )
+
+ new LRModel(
+ tfIdf = pd.tfIdf,
+ categoryMap = pd.categoryMap,
+ lrModels = lrModels
+ )
+ }
+
+ def predict(model: LRModel, query: Query): PredictedResult = {
+ model.predict(query.text)
+ }
+}
+
+case class LREstimate (
+ coefficients : Array[Double],
+ intercept : Double
+) extends Serializable
+
+class LRModel(
+ val tfIdf: TFIDFModel,
+ val categoryMap: Map[Double, String],
+ val lrModels: Seq[(Double, LREstimate)]) extends Serializable {
+
+ /** Enable vector inner product for prediction. */
+ private def innerProduct (x : Array[Double], y : Array[Double]) : Double = {
+ x.zip(y).map(e => e._1 * e._2).sum
+ }
+
+ /** Define prediction rule. */
+ def predict(text: String): PredictedResult = {
+ val x: Array[Double] = tfIdf.transform(text).toArray
+
+ // Logistic Regression binary formula for positive probability.
+ // According to MLLib documentation, class labeled 0 is used as pivot.
+ // Thus, we are using:
+ // log(p1/p0) = log(p1/(1 - p1)) = b0 + xTb =: z
+ // p1 = exp(z) * (1 - p1)
+ // p1 * (1 + exp(z)) = exp(z)
+ // p1 = exp(z)/(1 + exp(z))
+ val pred = lrModels.map(
+ e => {
+ val z = scala.math.exp(innerProduct(e._2.coefficients, x) + e._2.intercept)
+ (e._1, z / (1 + z))
+ }
+ ).maxBy(_._2)
+
+ PredictedResult(categoryMap(pred._1), pred._2)
+ }
+
+ override def toString = s"LR model"
+}
diff --git a/src/main/scala/NBAlgorithm.scala b/src/main/scala/NBAlgorithm.scala
new file mode 100644
index 0000000..b3f6d08
--- /dev/null
+++ b/src/main/scala/NBAlgorithm.scala
@@ -0,0 +1,82 @@
+package org.template.textclassification
+
+import io.prediction.controller.P2LAlgorithm
+import io.prediction.controller.Params
+import org.apache.spark.SparkContext
+import org.apache.spark.mllib.classification.NaiveBayes
+import org.apache.spark.mllib.classification.NaiveBayesModel
+import org.apache.spark.mllib.linalg.Vector
+import com.github.fommil.netlib.F2jBLAS
+
+import scala.math._
+
+/** Define parameters for Supervised Learning Model. We are
+ * using a Naive Bayes classifier, which gives us only one
+ * hyperparameter in this stage.
+ */
+case class NBAlgorithmParams(lambda: Double) extends Params
+
+/** Define SupervisedAlgorithm class. */
+class NBAlgorithm(
+ val ap: NBAlgorithmParams
+) extends P2LAlgorithm[PreparedData, NBModel, Query, PredictedResult] {
+
+ /** Train your model. */
+ def train(sc: SparkContext, pd: PreparedData): NBModel = {
+ // Fit a Naive Bayes model using the prepared data.
+ val nb: NaiveBayesModel = NaiveBayes.train(pd.transformedData, ap.lambda)
+
+ new NBModel(
+ tfIdf = pd.tfIdf,
+ categoryMap = pd.categoryMap,
+ nb = nb)
+ }
+
+ /** Prediction method for trained model. */
+ def predict(model: NBModel, query: Query): PredictedResult = {
+ model.predict(query.text)
+ }
+}
+
+class NBModel(
+ val tfIdf: TFIDFModel,
+ val categoryMap: Map[Double, String],
+ val nb: NaiveBayesModel
+) extends Serializable {
+
+ private def innerProduct (x : Array[Double], y : Array[Double]) : Double = {
+ x.zip(y).map(e => e._1 * e._2).sum
+ }
+
+ val normalize = (u: Array[Double]) => {
+ val uSum = u.sum
+
+ u.map(e => e / uSum)
+ }
+
+ private val scoreArray = nb.pi.zip(nb.theta)
+
+ /** Given a document string, return a vector of corresponding
+ * class membership probabilities.
+ * Helper function used to normalize probability scores.
+ * Returns an object of type Array[Double]
+ */
+ private def getScores(doc: String): Array[Double] = {
+ // Vectorize query
+ val x: Vector = tfIdf.transform(doc)
+
+ val z = scoreArray
+ .map(e => innerProduct(e._2, x.toArray) + e._1)
+
+ normalize((0 until z.size).map(k => exp(z(k) - z.max)).toArray)
+ }
+
+ /** Implement predict method for our model using
+ * the prediction rule given in tutorial.
+ */
+ def predict(doc : String) : PredictedResult = {
+ val x: Array[Double] = getScores(doc)
+ val y: (Double, Double) = (nb.labels zip x).maxBy(_._2)
+ new PredictedResult(categoryMap.getOrElse(y._1, ""), y._2)
+ }
+}
diff --git a/src/main/scala/Preparator.scala b/src/main/scala/Preparator.scala
new file mode 100644
index 0000000..c990944
--- /dev/null
+++ b/src/main/scala/Preparator.scala
@@ -0,0 +1,97 @@
+package org.template.textclassification
+
+import io.prediction.controller.PPreparator
+import io.prediction.controller.Params
+
+import org.apache.spark.SparkContext
+import org.apache.spark.SparkContext._
+import org.apache.spark.mllib.feature.{IDF, IDFModel, HashingTF}
+import org.apache.spark.mllib.linalg.Vector
+import org.apache.spark.mllib.linalg.Vectors
+import org.apache.spark.mllib.regression.LabeledPoint
+import org.apache.spark.rdd.RDD
+
+/** Define Preparator parameters. Recall that for our data
+ * representation we are only required to input the n-gram window
+ * components.
+ */
+case class PreparatorParams(
+ nGram: Int,
+ numFeatures: Int = 15000
+) extends Params
+
+/** define your Preparator class */
+class Preparator(pp: PreparatorParams)
+ extends PPreparator[TrainingData, PreparedData] {
+
+ def prepare(sc: SparkContext, td: TrainingData): PreparedData = {
+
+ val tfHasher = new TFHasher(pp.numFeatures, pp.nGram)
+
+ // Convert the training data's observation text into TF vectors
+ // and then fit an IDF model
+ val idf: IDFModel = new IDF().fit(td.data.map(e => tfHasher.hashTF(e.text)))
+
+ val tfIdfModel = new TFIDFModel(
+ hasher = tfHasher,
+ idf = idf
+ )
+
+ // Transform RDD[Observation] to RDD[(Label, text)]
+ val doc: RDD[(Double, String)] = td.data.map (obs => (obs.label, obs.text))
+
+ // transform RDD[(Label, text)] to RDD[LabeledPoint]
+ val transformedData: RDD[(LabeledPoint)] = tfIdfModel.transform(doc)
+
+ // Finally extract category map, associating label to category.
+ val categoryMap = td.data.map(obs => (obs.label, obs.category)).collectAsMap.toMap
+
+ new PreparedData(
+ tfIdf = tfIdfModel,
+ transformedData = transformedData,
+ categoryMap = categoryMap
+ )
+ }
+
+}
+
+class TFHasher(
+ val numFeatures: Int,
+ val nGram: Int
+) extends Serializable {
+
+ private val hasher = new HashingTF(numFeatures = numFeatures)
+
+ /** Hashing function: Text -> term frequency vector. */
+ def hashTF(text: String): Vector = {
+ val newList : Array[String] = text.split(" ")
+ .sliding(nGram)
+ .map(_.mkString)
+ .toArray
+
+ hasher.transform(newList)
+ }
+}
+
+class TFIDFModel(
+ val hasher: TFHasher,
+ val idf: IDFModel
+) extends Serializable {
+
+ /** Transform text to a tf-idf vector. */
+ def transform(text: String): Vector = {
+ // Hash text to a term-frequency vector, then apply IDF weighting.
+ idf.transform(hasher.hashTF(text))
+ }
+
+ /** transform RDD of (label, text) to RDD of LabeledPoint */
+ def transform(doc: RDD[(Double, String)]): RDD[LabeledPoint] = {
+ doc.map{ case (label, text) => LabeledPoint(label, transform(text)) }
+ }
+}
+
+class PreparedData(
+ val tfIdf: TFIDFModel,
+ val transformedData: RDD[LabeledPoint],
+ val categoryMap: Map[Double, String]
+) extends Serializable
diff --git a/src/main/scala/Serving.scala b/src/main/scala/Serving.scala
new file mode 100644
index 0000000..7dd2573
--- /dev/null
+++ b/src/main/scala/Serving.scala
@@ -0,0 +1,13 @@
+package org.template.textclassification
+
+import io.prediction.controller.LServing
+
+class Serving
+ extends LServing[Query, PredictedResult] {
+
+ override
+ def serve(query: Query,
+ predictedResults: Seq[PredictedResult]): PredictedResult = {
+ predictedResults.maxBy(e => e.confidence)
+ }
+}
diff --git a/src/main/scala/org/template/textclassification/BIDMachLRAlgorithm.scala b/src/main/scala/org/template/textclassification/BIDMachLRAlgorithm.scala
deleted file mode 100644
index 4dea636..0000000
--- a/src/main/scala/org/template/textclassification/BIDMachLRAlgorithm.scala
+++ /dev/null
@@ -1,178 +0,0 @@
-package org.template.textclassification
-
-import java.io.{InputStreamReader, BufferedReader, ByteArrayInputStream, Serializable}
-
-import BIDMat.{CMat,CSMat,DMat,Dict,FMat,FND,GMat,GDMat,GIMat,GLMat,GSMat,GSDMat,HMat,IDict,Image,IMat,LMat,Mat,SMat,SBMat,SDMat}
-import BIDMat.MatFunctions._
-import BIDMat.SciFunctions._
-import BIDMat.Solvers._
-import BIDMat.Plotting._
-import BIDMach.Learner
-import BIDMach.models.{FM,GLM,KMeans,KMeansw,LDA,LDAgibbs,Model,NMF,SFA,RandomForest}
-import BIDMach.networks.{DNN}
-import BIDMach.datasources.{DataSource,MatDS,FilesDS,SFilesDS}
-import BIDMach.mixins.{CosineSim,Perplexity,Top,L1Regularizer,L2Regularizer}
-import BIDMach.updaters.{ADAGrad,Batch,BatchNorm,IncMult,IncNorm,Telescoping}
-import BIDMach.causal.{IPTW}
-
-import io.prediction.controller.{P2LAlgorithm, Params}
-import org.apache.spark.SparkContext
-import org.apache.spark.ml.classification.LogisticRegression
-import org.apache.spark.mllib.linalg.{DenseVector, SparseVector}
-import org.apache.spark.rdd.RDD
-import org.apache.spark.sql.DataFrame
-
-case class BIDMachLRAlgorithmParams (
- regParam : Double
- ) extends Params
-
-
-class BIDMachLRAlgorithm(
- val sap: BIDMachLRAlgorithmParams
- ) extends P2LAlgorithm[PreparedData, NativeLRModel, Query, PredictedResult] {
- // Train your model.
- def train(sc: SparkContext, pd: PreparedData): NativeLRModel = {
- new BIDMachLRModel(sc, pd, sap.regParam)
- }
-
- // Prediction method for trained model.
- def predict(model: NativeLRModel, query: Query): PredictedResult = {
- model.predict(query.text)
- }
-
-}
-
- class BIDMachLRModel (
- sc : SparkContext,
- pd : PreparedData,
- regParam : Double
- ) extends Serializable with NativeLRModel {
-
- private val labels: Seq[Double] = pd.categoryMap.keys.toSeq
-
- val data = prepareDataFrame(sc, pd, labels)
-
- private val lrModels = fitLRModels
-
- def fitLRModels:Seq[(Double, LREstimate)] = {
-
- Mat.checkMKL
- Mat.checkCUDA
- if (Mat.hasCUDA > 0) GPUmem
-
- // 3. Create a logistic regression model for each class.
- val lrModels: Seq[(Double, LREstimate)] = labels.map(
- label => {
- val lab = label.toInt.toString
-
- val (categories, features) = getFMatsFromData(lab, data)
-
- val mm: Learner = trainGLM(features, FMat(categories))
-
- test(categories, features, mm)
- val modelmat = FMat(mm.modelmat)
- val weightSize = size(modelmat)._2 -1
-
- val weights = modelmat(1,0 to weightSize)
-
- val weightArray = (for(i <- 0 to weightSize -1) yield weights(0,i).toDouble).toArray
-
- // Return (label, feature coefficients, and intercept term.
- (label, LREstimate(weightArray, weights(0,weightSize)))
- }
- )
- lrModels
- }
-
- def predict(text : String): PredictedResult = {
- predict(text, pd, lrModels)
- }
-
- def trainGLM(traindata:SMat, traincats: FMat): Learner = {
- //min(traindata, 1, traindata) // the first "traindata" argument is the input, the other is output
-
- val (mm, mopts) = GLM.learner(traindata, traincats, GLM.logistic)
- mopts.what
-
- mopts.lrate = 0.1
- mopts.reg1weight = regParam
- mopts.batchSize = 1000
- mopts.npasses = 250
- mopts.autoReset = false
- mopts.addConstFeat = true
- mm.train
- mm
- }
-
- def getFMatsFromData(lab: String, data:DataFrame): (FMat, SMat) = {
- val features = data.select(lab, "features")
-
- val sparseVectorsWithRowIndices = (for (r <- features) yield (r.getAs[SparseVector](1), r.getAs[Double](0))).zipWithIndex
-
- val triples = for {
- ((vector, innerLabel), rowIndex) <- sparseVectorsWithRowIndices
- (index, value) <- vector.indices zip vector.values
- } yield ((rowIndex.toInt,index,value), innerLabel)
-
- val catTriples = for {
- ((vector, innerLabel), rowIndex) <- sparseVectorsWithRowIndices
- } yield (rowIndex.toInt,innerLabel.toInt,1.0)
-
- val cats = catTriples
- val feats = triples.map(x => x._1)
-
- val numRows = cats.count().toInt
-
- val catsMat = loadFMatTxt(cats,numRows)
-
- val featsMat = loadFMatTxt(feats,numRows)
-
- println(featsMat)
-
- (full(catsMat), featsMat)
- }
-
- //See https://github.com/BIDData/BIDMat/blob/master/src/main/scala/BIDMat/HMat.scala , method loadDMatTxt
- def loadFMatTxt(cats:RDD[(Int,Int,Double)], nrows: Int):SMat = {
-
- val rows = cats.map(x=> x._1).collect()
- val cols = cats.map(x=> x._2).collect()
- val vals = cats.map(x=> x._3).collect()
-
-
- println("LOADING")
-
- sparse(icol(cols.toList),icol(rows.toList),col(vals.toList))
- }
-
- def test(categories: DMat, features: SMat, mm: Learner): Unit = {
- val testdata = features
- val testcats = categories
-
- //min(testdata, 1, testdata)
-
- val predcats = zeros(testcats.nrows, testcats.ncols)
-
-
-
- val (nn, nopts) = GLM.predictor(mm.model, testdata, predcats)
-
-
-
- nopts.addConstFeat = true
- nn.predict
-
-
- computeAccuracy(FMat(testcats), predcats)
- }
-
- def computeAccuracy(testcats: FMat, predcats: FMat): Unit = {
- //println(testcats)
- //println(predcats)
-
- val lacc = (predcats ∙→ testcats + (1 - predcats) ∙→ (1 - testcats)) / predcats.ncols
- lacc.t
- println(mean(lacc))
- }
-
-}
diff --git a/src/main/scala/org/template/textclassification/Engine.scala b/src/main/scala/org/template/textclassification/Engine.scala
deleted file mode 100644
index 5002928..0000000
--- a/src/main/scala/org/template/textclassification/Engine.scala
+++ /dev/null
@@ -1,51 +0,0 @@
-package org.template.textclassification
-
-import io.prediction.controller._
-
-
-
-// 1. Define Query class which serves as a wrapper for
-// new text data.
-class Query(
- val text: String
-) extends Serializable
-
-
-
-// 2. Define PredictedResult class which serves as a
-// wrapper for a predicted class label and the associated
-// prediction confidence.
-case class PredictedResult (
- val category: String,
- val confidence: Double
-) extends Serializable
-
-
-
-
-
-// 3. Define ActualResult class which serves as a wrapper
-// for an observation's true class label.
-class ActualResult(
- val category: String
-) extends Serializable
-
-
-
-// 4. Initialize the engine.
-object TextClassificationEngine extends EngineFactory {
- override
- def apply() = {
- new Engine(
- classOf[DataSource],
- classOf[Preparator],
- Map(
- "VWlogisticSGD" -> classOf[VowpalLogisticRegressionWithSGDAlgorithm],
- "nb" -> classOf[NBAlgorithm],
- "lr" -> classOf[LRAlgorithm],
- "bid-lr" -> classOf[BIDMachLRAlgorithm]
- ), classOf[Serving]
- )
- }
-}
-
diff --git a/src/main/scala/org/template/textclassification/LRAlgorithm.scala b/src/main/scala/org/template/textclassification/LRAlgorithm.scala
deleted file mode 100644
index f8cae74..0000000
--- a/src/main/scala/org/template/textclassification/LRAlgorithm.scala
+++ /dev/null
@@ -1,89 +0,0 @@
-package org.template.textclassification
-
-import java.io._
-
-import BIDMat.{DMat, Mat}
-import io.prediction.controller.Params
-import io.prediction.controller.P2LAlgorithm
-import io.prediction.workflow.FakeRun
-import org.apache.spark.SparkContext
-import org.apache.spark.ml.classification.LogisticRegression
-import org.apache.spark.mllib.linalg.SparseVector
-import org.apache.spark.rdd.RDD
-import org.apache.spark.sql.DataFrame
-import org.apache.spark.sql.functions
-import org.apache.spark.sql.SQLContext
-import org.apache.spark.sql.UserDefinedFunction
-import com.github.fommil.netlib.F2jBLAS
-import org.template.textclassification.NativeLRModel
-
-
-import scala.math._
-
-
-case class LRAlgorithmParams (
- regParam : Double
-) extends Params
-
-
-class LRAlgorithm(
- val sap: LRAlgorithmParams
-) extends P2LAlgorithm[PreparedData, LRModel, Query, PredictedResult] {
-
- // Train your model.
- def train(sc: SparkContext, pd: PreparedData): LRModel = {
- new LRModel(sc, pd, sap.regParam)
- }
-
- // Prediction method for trained model.
- def predict(model: LRModel, query: Query): PredictedResult = {
- model.predict(query.text)
- }
-}
-
-class LRModel (
- sc : SparkContext,
- pd : PreparedData,
- regParam : Double
-) extends Serializable with NativeLRModel {
- private val labels: Seq[Double] = pd.categoryMap.keys.toSeq
-
- val data = prepareDataFrame(sc, pd, labels)
-
- private val lrModels = fitLRModels
-
- def fitLRModels:Seq[(Double, LREstimate)] = {
- val lr = new LogisticRegression()
- .setMaxIter(10)
- .setThreshold(0.5)
- .setRegParam(regParam)
-
- // 3. Create a logistic regression model for each class.
- val lrModels: Seq[(Double, LREstimate)] = labels.map(
- label => {
- val lab = label.toInt.toString
-
- //val (categories, features) = getDMatsFromData(lab)
-
-
- val fit = lr.setLabelCol(lab).fit(
- data.select(lab, "features")
- )
-
-
- // Return (label, feature coefficients, and intercept term.
- (label, LREstimate(fit.weights.toArray, fit.intercept))
-
- }
- )
- lrModels
- }
-
- def predict(text : String): PredictedResult = {
- predict(text, pd, lrModels)
- }
-
-
-}
-
-
diff --git a/src/main/scala/org/template/textclassification/NBAlgorithm.scala b/src/main/scala/org/template/textclassification/NBAlgorithm.scala
deleted file mode 100644
index a89c013..0000000
--- a/src/main/scala/org/template/textclassification/NBAlgorithm.scala
+++ /dev/null
@@ -1,94 +0,0 @@
-package org.template.textclassification
-
-import io.prediction.controller.P2LAlgorithm
-import io.prediction.controller.Params
-import org.apache.spark.SparkContext
-import org.apache.spark.mllib.classification.NaiveBayes
-import org.apache.spark.mllib.classification.NaiveBayesModel
-import org.apache.spark.mllib.linalg.Vector
-import com.github.fommil.netlib.F2jBLAS
-
-import scala.math._
-
-// 1. Define parameters for Supervised Learning Model. We are
-// using a Naive Bayes classifier, which gives us only one
-// hyperparameter in this stage.
-
-case class NBAlgorithmParams(
- lambda: Double
-) extends Params
-
-
-
-// 2. Define SupervisedAlgorithm class.
-
-class NBAlgorithm(
- val sap: NBAlgorithmParams
-) extends P2LAlgorithm[PreparedData, NBModel, Query, PredictedResult] {
-
- // Train your model.
- def train(sc: SparkContext, pd: PreparedData): NBModel = {
- new NBModel(pd, sap.lambda)
- }
-
- // Prediction method for trained model.
- def predict(model: NBModel, query: Query): PredictedResult = {
- model.predict(query.text)
- }
-}
-
-class NBModel(
-val pd: PreparedData,
-lambda: Double
-) extends Serializable {
-
-
-
- // 1. Fit a Naive Bayes model using the prepared data.
-
- private val nb : NaiveBayesModel = NaiveBayes.train(
- pd.transformedData.map(x=>x.point), lambda)
-
-
-
- // 2. Set up linear algebra framework.
-
- private def innerProduct (x : Array[Double], y : Array[Double]) : Double = {
- x.zip(y).map(e => e._1 * e._2).sum
- }
-
- val normalize = (u: Array[Double]) => {
- val uSum = u.sum
-
- u.map(e => e / uSum)
- }
-
-
-
- private val scoreArray = nb.pi.zip(nb.theta)
-
- // 3. Given a document string, return a vector of corresponding
- // class membership probabilities.
-
- private def getScores(doc: String): Array[Double] = {
- // Helper function used to normalize probability scores.
- // Returns an object of type Array[Double]
-
- // Vectorize query,
- val x: Vector = pd.transform(doc).vector
-
- val z = scoreArray
- .map(e => innerProduct(e._2, x.toArray) + e._1)
-
- normalize((0 until z.size).map(k => exp(z(k) - z.max)).toArray)
- }
-
- // 4. Implement predict method for our model using
- // the prediction rule given in tutorial.
-
- def predict(doc : String) : PredictedResult = {
- val x: Array[Double] = getScores(doc)
- val y: (Double, Double) = (nb.labels zip x).maxBy(_._2)
- new PredictedResult(pd.categoryMap.getOrElse(y._1, ""), y._2)
- }
-}
\ No newline at end of file
diff --git a/src/main/scala/org/template/textclassification/NativeLRModel.scala b/src/main/scala/org/template/textclassification/NativeLRModel.scala
deleted file mode 100644
index feb2e53..0000000
--- a/src/main/scala/org/template/textclassification/NativeLRModel.scala
+++ /dev/null
@@ -1,69 +0,0 @@
-package org.template.textclassification
-
-import java.io.Serializable
-
-import org.apache.spark.SparkContext
-import org.apache.spark.sql.{functions, UserDefinedFunction, SQLContext, DataFrame}
-
-import scala.math._
-
-/**
- * Created by burtn on 15/07/15.
- */
-trait NativeLRModel {
- case class LREstimate (
- coefficients : Array[Double],
- intercept : Double
- ) extends Serializable
-
-
- def fitLRModels:Seq[(Double, LREstimate)]
-
- def predict(text: String) : PredictedResult
-
- def prepareDataFrame(sc : SparkContext, pd : PreparedData, labels: Seq[Double]): DataFrame = {
- // 1. Import SQLContext for creating DataFrame.
- val sql: SQLContext = new SQLContext(sc)
- import sql.implicits._
-
- // 2. Initialize logistic regression model with regularization parameter.
-
- labels.foldLeft(pd.transformedData.map(x => x.point).toDF)(//transform to Spark DataFrame
-
- // Add the different binary columns for each label.
- (data: DataFrame, label: Double) => {
- // function: multiclass labels --> binary labels
- val f: UserDefinedFunction = functions.udf((e: Double) => if (e == label) 1.0 else 0.0)
-
- data.withColumn(label.toInt.toString, f(data("label")))
- }
- )
- }
-
- // 4. Enable vector inner product for prediction.
-
- private def innerProduct (x : Array[Double], y : Array[Double]) : Double = {
- x.zip(y).map(e => e._1 * e._2).sum
- }
-
- // 5. Define prediction rule.
- def predict(text : String, pd : PreparedData,lrModels:Seq[(Double, LREstimate)]): PredictedResult = {
- val x : Array[Double] = pd.transform(text).vector.toArray
-
- // Logistic Regression binary formula for positive probability.
- // According to MLLib documentation, class labeled 0 is used as pivot.
- // Thus, we are using:
- // log(p1/p0) = log(p1/(1 - p1)) = b0 + xTb =: z
- // p1 = exp(z) * (1 - p1)
- // p1 * (1 + exp(z)) = exp(z)
- // p1 = exp(z)/(1 + exp(z))
- val pred = lrModels.map(
- e => {
- val z = exp(innerProduct(e._2.coefficients, x) + e._2.intercept)
- (e._1, z / (1 + z))
- }
- ).maxBy(_._2)
-
- PredictedResult(pd.categoryMap(pred._1), pred._2)
- }
-}
\ No newline at end of file
diff --git a/src/main/scala/org/template/textclassification/Preparator.scala b/src/main/scala/org/template/textclassification/Preparator.scala
deleted file mode 100644
index d55fd64..0000000
--- a/src/main/scala/org/template/textclassification/Preparator.scala
+++ /dev/null
@@ -1,198 +0,0 @@
-package org.template.textclassification
-
-
-import io.prediction.controller.PPreparator
-import io.prediction.controller.Params
-import org.apache.spark.SparkContext
-import org.apache.spark.mllib.feature.{IDF, IDFModel, HashingTF}
-import org.apache.spark.mllib.linalg._
-import org.apache.spark.mllib.linalg.distributed._
-import org.apache.spark.mllib.regression.LabeledPoint
-import org.apache.spark.rdd.RDD
-import org.apache.spark.sql.Row
-
-import scala.collection.Map
-import scala.collection.immutable.HashMap
-import scala.collection.JavaConversions._
-import scala.math._
-
-
-// 1. Initialize Preparator parameters. Recall that for our data
-// representation we are only required to input the n-gram window
-// components.
-
-case class PreparatorParams(
- nGram: Int,
- numFeatures: Int = 5000,
- SPPMI: Boolean
-) extends Params
-
-case class VectorAndTextExample(
- vector: SparseVector,
- text : String
- ) extends Serializable
-
-case class LabeledPointAndTextExample(
- point: LabeledPoint,
- text : String
- ) extends Serializable
-
-
-// 2. Initialize your Preparator class.
-
-class Preparator(pp: PreparatorParams) extends PPreparator[TrainingData, PreparedData] {
-
- // Prepare your training data.
- def prepare(sc : SparkContext, td: TrainingData): PreparedData = {
- new PreparedData(td, pp.nGram, pp.numFeatures, pp.SPPMI, sc)
- }
-}
-
-//------PreparedData------------------------
-
-class PreparedData(
- val td: TrainingData,
- val nGram: Int,
- val numFeatures: Int,
- val SPPMI: Boolean,
- @transient val sc: SparkContext
-) extends Serializable {
-
- // 1. Hashing function: Text -> term frequency vector.
-
- private val hasher = new HashingTF(numFeatures = numFeatures)
-
-
- def transform(text: String): VectorAndTextExample ={
- return if(SPPMI) transformSPPMI(text) else transformTFIDF(text)
- }
-
- val idf : IDFModel = new IDF().fit(td.data.map(e => hashTF(e.text)))
-
-
- //3. Document Transformer: text => tf-idf vector.
-
- private def transformTFIDF(text : String): VectorAndTextExample = {
- // Map(n-gram -> document tf)
- val result = VectorAndTextExample(idf.transform(hashTF(text)).toSparse, text)
- //println(result)
- result
- }
-
- val ppmiMap = generateSPPMIMatrix(td,sc).collectAsMap()
- println(ppmiMap.head._2.size)
- println(ppmiMap.head)
-
-
- private def hashTF(text: String): Vector = {
- val newList: Array[String] = text.split(" ")
- .sliding(nGram)
- .map(_.mkString)
- .toArray
-
- hasher.transform(newList)
- }
-
- private def transformSPPMI(text : String): VectorAndTextExample = {
- // Map(n-gram -> document tf)
-
- val result = VectorAndTextExample(ppmiMap(text), text)
- //println(result)
- result
- }
-
-
- private def calculateSPPMI(localMat: Matrix, N: Long, k: Int): IndexedSeq[MatrixEntry] = {
- //println(localMat)
- val pmiMatrixEntries = for (i <- 0 until localMat.numCols; j <- 0 until localMat.numRows)
- yield {
- new MatrixEntry(j, i, math.max(0, math.log(localMat(j, i) * N / (localMat(i, i) * localMat(j, j))) / math.log(2.0) - math.log(k) / math.log(2.0)))
- }
- return pmiMatrixEntries
- }
-
- private def generateSPPMIMatrix(trainData: TrainingData, sc:SparkContext) : RDD[(String,SparseVector)] = {
- val (hashedFeats: RDD[Vector], mat: IndexedRowMatrix, cooccurrences: Matrix) = computeCooccurrences(trainData)
-
- val k = 10
- val pmiEntries = calculateSPPMI(cooccurrences , mat.numRows, k)
- val pmiMat: CoordinateMatrix = new CoordinateMatrix(sc.parallelize(pmiEntries))
- val indexedPMIMat = pmiMat.toIndexedRowMatrix()
-
- //val principalComponents = indexedPMIMat.toRowMatrix().computePrincipalComponents(500)
- //val pcPMImat = indexedPMIMat.multiply(principalComponents)
-
- println(trainData.data.count())
- println(indexedPMIMat.numCols())
-// println(pcPMImat.numCols())
-
- val pmiMatRows = indexedPMIMat.rows.map(e=> e.index -> e.vector).collectAsMap()
-
- return generateTextToSPPMIVectorMap(trainData, hashedFeats, pmiMatRows)
- }
- private def generateTextToSPPMIVectorMap(trainData: TrainingData, hashedFeats: RDD[Vector], pmiMatRows: Map[Long, Vector]): RDD[(String, SparseVector)] = {
- //TODO: take into account feature counts, currently it's on/off
- //also not use var
- val composedWordVectors = for (v <- hashedFeats)
- yield {
- var ar = Array.fill[Double](pmiMatRows.head._2.size)(0)
- for (i <- 0 until v.size; if v(i) > 0) {
- //Additive
- //ar = (ar,pmiMatRows(i).toArray).zipped.map(_ + _)
-
- //Appending
- ar = ar ++ pmiMatRows(i).toArray
- }
-
- Vectors.dense(ar.map(x => x)).toSparse
- }
-
- val textToSPPMIVectorMap = (trainData.data.map(x => x.text) zip composedWordVectors)
- textToSPPMIVectorMap
- }
-
- private def computeCooccurrences(trainData: TrainingData): (RDD[Vector], IndexedRowMatrix, Matrix) = {
- val hashedFeats = trainData.data.map(e => hashTF(e.text))
-
- val rows = hashedFeats.map( x =>
- x.toArray.map( value => if (value > 0) 1.0 else 0.0)).map( y => Vectors.dense(y).toSparse)
-
- val indexedRows = rows.zipWithIndex.map(x => new IndexedRow(x._2, x._1))
-
- val mat = new IndexedRowMatrix(indexedRows)
-
-
- //println(mat.toBlockMatrix().toLocalMatrix())
-
- //println(blockMat.numCols())
- //println(blockMat.numRows())
-
- val cooccurrences = mat.computeGramianMatrix()
- //Alternatively:
- //val cooccurrences = blockMat.transpose.multiply(blockMat)
- (hashedFeats, mat, cooccurrences)
- }
-
-
-
-
-
-
-
-
- // 4. Data Transformer: RDD[documents] => RDD[LabeledPoints]
-
- val transformedData: RDD[LabeledPointAndTextExample] = {
- td.data.map(e => LabeledPointAndTextExample(LabeledPoint(e.label, transform(e.text).vector), e.text))
- }
-
-
- // 5. Finally extract category map, associating label to category.
- val categoryMap = td.data.map(e => (e.label, e.category)).collectAsMap
-
-
-}
-
-
-
-
diff --git a/src/main/scala/org/template/textclassification/Serving.scala b/src/main/scala/org/template/textclassification/Serving.scala
deleted file mode 100644
index b41389e..0000000
--- a/src/main/scala/org/template/textclassification/Serving.scala
+++ /dev/null
@@ -1,13 +0,0 @@
-package org.template.textclassification
-
-import io.prediction.controller.LServing
-
-// 1. Define serving component.
-class Serving extends LServing[Query, PredictedResult] {
-
- override
- def serve(query: Query, predictedResults: Seq[PredictedResult]):
- PredictedResult = predictedResults.maxBy(e => e.confidence)
-}
-
-
diff --git a/src/main/scala/org/template/textclassification/VowpalLogisticRegressionWithSGDAlgorithm.scala b/src/main/scala/org/template/textclassification/VowpalLogisticRegressionWithSGDAlgorithm.scala
deleted file mode 100644
index c40b187..0000000
--- a/src/main/scala/org/template/textclassification/VowpalLogisticRegressionWithSGDAlgorithm.scala
+++ /dev/null
@@ -1,91 +0,0 @@
-package org.template.textclassification
-
-import io.prediction.controller.P2LAlgorithm
-import io.prediction.controller.Params
-
-import org.apache.spark.mllib.linalg.Vectors
-import org.apache.spark.SparkContext
-import org.apache.spark.mllib.linalg.Vector
-import grizzled.slf4j.Logger
-
-import java.nio.file.{Files, Paths}
-
-import vw.VW
-
-case class AlgorithmParams(
- maxIter: Int,
- regParam: Double,
- stepSize: Double,
- bitPrecision: Int,
- modelName: String,
- namespace: String,
- ngram: Int
-) extends Params
-
-// extends P2LAlgorithm because VW doesn't contain RDD.
-class VowpalLogisticRegressionWithSGDAlgorithm(val ap: AlgorithmParams)
- extends P2LAlgorithm[PreparedData, Array[Byte], Query, PredictedResult] {
-
- @transient lazy val logger = Logger[this.type]
-
- def train(sc: SparkContext, data: PreparedData): Array[Byte] = {
-
- require(!data.td.data.take(1).isEmpty,
- s"RDD[labeldPoints] in PreparedData cannot be empty." +
- " Please check if DataSource generates TrainingData" +
- " and Preprator generates PreparedData correctly.")
-
- val reg = "--l2 " + ap.regParam
- //val iters = "-c -k --passes " + ap.maxIter
- val lrate = "-l " + ap.stepSize
- val ngram = "--ngram " + ap.ngram
-
- val vw = new VW("--loss_function logistic --invert_hash readable.model -b " + ap.bitPrecision + " " + "-f " + ap.modelName + " " + reg + " " + lrate + " " + ngram)
-
- val inputs = for (point <- data.transformedData.collect) yield (if (point.point.label.toDouble == 0.0) "-1.0" else "1.0") + " |" + ap.namespace + " " + rawTextToVWFormattedString(point.text) + " " + vectorToVWFormattedString(point.point.features)
-
- //val inputs = for (point <- data.transformedData) yield (if (point.label.toDouble == 0.0) "-1.0" else "1.0") + " |" + ap.namespace + " " + rawTextToVWFormattedString(point.)
-
- //Regressing
- //val inputs = for (point <- data.td.data) yield point.category.toDouble.toString + " |" + ap.namespace + " " + rawTextToVWFormattedString(point.text)
-
-
- //for (item <- inputsCollected) logger.info(item)
-
- val results = for (item <- inputs) yield vw.learn(item)
-
- val matchOnTrainSet = for (item <- inputs) yield item.startsWith(if(vw.predict(item).toDouble > 0.5) "1" else "-1")
-
-
- val acc = (for (x <- matchOnTrainSet) yield if(x) 1 else 0).sum.toDouble / matchOnTrainSet.size
- println("Accuracy on Training set: " + acc)
-
- vw.close()
-
- Files.readAllBytes(Paths.get(ap.modelName))
- }
-
- def predict(byteArray: Array[Byte], query: Query): PredictedResult = {
- Files.write(Paths.get(ap.modelName), byteArray)
-
- val vw = new VW("--link logistic -i " + ap.modelName)
- val pred = vw.predict("|" + ap.namespace + " " + rawTextToVWFormattedString(query.text)).toDouble
- vw.close()
-
- val category = (if(pred > 0.5) 1 else 0).toString
- val prob = (if(pred > 0.5) pred else 1.0 - pred)
- val result = new PredictedResult(category, prob)
-
- result
- }
-
- def rawTextToVWFormattedString(str: String) : String = {
- //VW input cannot contain these characters
- str.replaceAll("[|:]", " ")
- }
-
- def vectorToVWFormattedString(vec: Vector): String = {
- vec.toArray.zipWithIndex.map{ case (dbl, int) => s"$int:$dbl"} mkString " "
- }
-
-}
diff --git a/train.sh b/train.sh
deleted file mode 100755
index cc47b8b..0000000
--- a/train.sh
+++ /dev/null
@@ -1,61 +0,0 @@
-#!/bin/bash
-# export JAVA_HOME="" # Set here if not set in environment
-# export CUDA_PATH="" # Set here if not set in environment
-JCUDA_VERSION="0.7.0a" # Fix if needed
-MEMSIZE="-Xmx14G"
-export JAVA_OPTS="${MEMSIZE} -Xms128M -Dfile.encoding=UTF-8" # Set as much memory as possible
-BIDMACH_ROOT="${BASH_SOURCE[0]}"
-if [ ! `uname` = "Darwin" ]; then
- BIDMACH_ROOT=`readlink -f "${BIDMACH_ROOT}"`
-else
- while [ -L "${BIDMACH_ROOT}" ]; do
- BIDMACH_ROOT=`readlink "${BIDMACH_ROOT}"`
- done
-fi
-BIDMACH_ROOT=`dirname "$BIDMACH_ROOT"`
-pushd "${BIDMACH_ROOT}" > /dev/null
-BIDMACH_ROOT=`pwd`
-BIDMACH_ROOT="$( echo ${BIDMACH_ROOT} | sed s+/cygdrive/c+c:+ )"
-JCUDA_LIBDIR="${BIDMACH_ROOT}/lib"
-LIBDIR="${BIDMACH_ROOT}/lib"
-if [ -e java_native_path.txt ]; then
- JAVA_NATIVE=`cat java_native_path.txt`
-else
- JAVA_NATIVE=`java getnativepath`
- echo ${JAVA_NATIVE} > java_native_path.txt
-fi
-if [ `uname` = "Darwin" ]; then
- export DYLD_LIBRARY_PATH="${LIBDIR}:/usr/local/cuda/lib:${DYLD_LIBRARY_PATH}"
-else
- export LD_LIBRARY_PATH="${LIBDIR}:${LIBDIR}/cuda:/usr/local/cuda/lib64:${LD_LIBRARY_PATH}"
-fi
-export JAVA_NATIVE=${JAVA_NATIVE}:${LD_LIBRARY_PATH}:${DYLD_LIBRARY_PATH}:
-popd > /dev/null
-if [ "$OS" = "Windows_NT" ]; then
- if [ ! "${JAVA_HOME}" = "" ]; then
- JAVA_HOME=`${BIDMACH_ROOT}/shortpath.bat "${JAVA_HOME}"`
- export JAVA_HOME=`echo ${JAVA_HOME} | sed 's_\\\\_/_g'`/bin
- fi
-fi
-
-BIDMACH_LIBS="${LIBDIR}/BIDMat.jar;${LIBDIR}/ptplot.jar;${LIBDIR}/ptplotapplication.jar;${LIBDIR}/jhdf5.jar;${LIBDIR}/commons-math3-3.2.jar;${LIBDIR}/lz4-1.3.jar"
-
-JCUDA_LIBS="${JCUDA_LIBDIR}/jcuda-${JCUDA_VERSION}.jar;${JCUDA_LIBDIR}/jcublas-${JCUDA_VERSION}.jar;${JCUDA_LIBDIR}/jcufft-${JCUDA_VERSION}.jar;${JCUDA_LIBDIR}/jcurand-${JCUDA_VERSION}.jar;${JCUDA_LIBDIR}/jcusparse-${JCUDA_VERSION}.jar"
-
-ALL_LIBS=";${LIBDIR}/IScala.jar;${BIDMACH_ROOT}/BIDMach.jar;${BIDMACH_LIBS};${JCUDA_LIBS};${JAVA_HOME}/lib/tools.jar"
-
-if [ "$OS" = "Windows_NT" ]; then
- if [ ! "${CUDA_PATH}" = "" ]; then
- NEWPATH=`${BIDMACH_ROOT}/shortpath.bat "${CUDA_PATH}"`
- NEWPATH=`echo ${NEWPATH} | sed 's_\\\\_/_g'`/bin
- fi
- DJAVA_NATIVE="-Djava.library.path=${LIBDIR};${NEWPATH}"
-else
- ALL_LIBS=`echo "${ALL_LIBS}" | sed 's/;/:/g'`
- DJAVA_NATIVE="-Djava.library.path=${JAVA_NATIVE}"
-fi
-if [ ! `uname` = "Darwin" ]; then
- export JAVA_OPTS="${DJAVA_NATIVE} ${JAVA_OPTS}"
-fi
-
-pio train -- --driver-memory 16g --executor-memory 8g --conf spark.driver.maxResultSize=3g --conf spark.akka.frameSize=2047