blob: 2941357acb7293b66a7ffbb81d70ef8e7024f17b [file] [log] [blame]
package org.apache.s2graph.s2jobs.wal.utils
import java.net.{URI, URLDecoder}
import scala.util.matching.Regex
object UrlUtils {
val pattern = new Regex("""(\\x[0-9A-Fa-f]{2}){3}""")
val koreanPattern = new scala.util.matching.Regex("([가-힣]+[\\-_a-zA-Z 0-9]*)+|([\\-_a-zA-Z 0-9]+[가-힣]+)")
// url extraction functions
def urlDecode(url: String): (Boolean, String) = {
try {
val decoded = URLDecoder.decode(url, "UTF-8")
(url != decoded, decoded)
} catch {
case e: Exception => (false, url)
}
}
def hex2String(url: String): String = {
pattern replaceAllIn(url, m => {
new String(m.toString.replaceAll("[^0-9A-Fa-f]", "").sliding(2, 2).toArray.map(Integer.parseInt(_, 16).toByte), "utf-8")
})
}
def toDomains(url: String, maxDepth: Int = 3): Seq[String] = {
val uri = new URI(url)
val domain = uri.getHost
if (domain == null) Nil
else {
val paths = uri.getPath.split("/")
if (paths.isEmpty) Seq(domain)
else {
val depth = Math.min(maxDepth, paths.size)
(1 to depth).map { ith =>
domain + paths.take(ith).mkString("/")
}
}
}
}
def extract(_url: String): (String, Seq[String], Option[String]) = {
try {
val url = hex2String(_url)
val (encoded, decodedUrl) = urlDecode(url)
val kwdOpt = koreanPattern.findAllMatchIn(decodedUrl).toList.map(_.group(0)).headOption.map(_.replaceAll("\\s", ""))
val domains = toDomains(url.replaceAll(" ", ""))
(decodedUrl, domains, kwdOpt)
} catch {
case e: Exception => (_url, Nil, None)
}
}
}