Skip to main content

Examples

Basic Usage

Extracting URLs with default options:

import io.lambdaworks.detection.UrlDetector
import io.lemonlabs.uri.{Host, AbsoluteUrl}

val detector: UrlDetector = UrlDetector.default
// detector: UrlDetector = io.lambdaworks.detection.UrlDetector@2152141
val text = "Visit https://example.com and www.lambdaworks.io for more info"
// text: String = "Visit https://example.com and www.lambdaworks.io for more info"
val extractedUrls: Set[AbsoluteUrl] = detector.extract(text)
// extractedUrls: Set[AbsoluteUrl] = Set(
// AbsoluteUrl(
// scheme = "https",
// authority = Authority(
// userInfo = None,
// host = DomainName(value = "example.com"),
// port = None
// ),
// path = ,
// query = QueryString(params = Vector()),
// fragment = None
// ),
// AbsoluteUrl(
// scheme = "http",
// authority = Authority(
// userInfo = None,
// host = DomainName(value = "www.lambdaworks.io"),
// port = None
// ),
// path = ,
// query = QueryString(params = Vector()),
// fragment = None
// )
// )

extractedUrls.foreach(println)
// https://example.com
// http://www.lambdaworks.io

Host Filtering

Allowing Specific Hosts

Extract URLs only from specific domains:

val allowedDetector = UrlDetector.default.withAllowed(Host.parse("lambdaworks.io"))
// allowedDetector: UrlDetector = io.lambdaworks.detection.UrlDetector@1665bc1b
val mixedText = "Check out lambdaworks.io and example.com"
// mixedText: String = "Check out lambdaworks.io and example.com"
val allowedUrls = allowedDetector.extract(mixedText)
// allowedUrls: Set[AbsoluteUrl] = Set(
// AbsoluteUrl(
// scheme = "http",
// authority = Authority(
// userInfo = None,
// host = DomainName(value = "lambdaworks.io"),
// port = None
// ),
// path = ,
// query = QueryString(params = Vector()),
// fragment = None
// )
// )

// Only returns lambdaworks.io
allowedUrls.foreach(println)
// http://lambdaworks.io

Denying Specific Hosts

Exclude URLs from specific domains:

val deniedDetector = UrlDetector.default.withDenied(Host.parse("ads.example.com"))
// deniedDetector: UrlDetector = io.lambdaworks.detection.UrlDetector@7c2ff2e
val adsText = "Visit example.com but not ads.example.com"
// adsText: String = "Visit example.com but not ads.example.com"
val filteredUrls = deniedDetector.extract(adsText)
// filteredUrls: Set[AbsoluteUrl] = Set(
// AbsoluteUrl(
// scheme = "http",
// authority = Authority(
// userInfo = None,
// host = DomainName(value = "example.com"),
// port = None
// ),
// path = ,
// query = QueryString(params = Vector()),
// fragment = None
// )
// )

// Returns example.com but not ads.example.com
filteredUrls.foreach(println)
// http://example.com

Multiple Allowed Hosts

Allow multiple domains:

val multiAllowed = UrlDetector.default
.withAllowed(
Host.parse("lambdaworks.io"),
Host.parse("github.com"),
Host.parse("scala-lang.org")
)
// multiAllowed: UrlDetector = io.lambdaworks.detection.UrlDetector@728cf334

val techText = "Visit lambdaworks.io, github.com, example.com, and scala-lang.org"
// techText: String = "Visit lambdaworks.io, github.com, example.com, and scala-lang.org"
val techUrls = multiAllowed.extract(techText)
// techUrls: Set[AbsoluteUrl] = Set(
// AbsoluteUrl(
// scheme = "http",
// authority = Authority(
// userInfo = None,
// host = DomainName(value = "lambdaworks.io"),
// port = None
// ),
// path = ,
// query = QueryString(params = Vector()),
// fragment = None
// ),
// AbsoluteUrl(
// scheme = "http",
// authority = Authority(
// userInfo = None,
// host = DomainName(value = "github.com"),
// port = None
// ),
// path = ,
// query = QueryString(params = Vector()),
// fragment = None
// ),
// AbsoluteUrl(
// scheme = "http",
// authority = Authority(
// userInfo = None,
// host = DomainName(value = "scala-lang.org"),
// port = None
// ),
// path = ,
// query = QueryString(params = Vector()),
// fragment = None
// )
// )

// Returns only allowed domains
techUrls.foreach(println)
// http://lambdaworks.io
// http://github.com
// http://scala-lang.org

Format-Specific Detection

JSON Content

Extracting URLs from JSON:

import io.lambdaworks.detection.UrlDetectorOptions

val jsonDetector = UrlDetector(UrlDetectorOptions.Json)
// jsonDetector: UrlDetector = io.lambdaworks.detection.UrlDetector@57c98a24
val jsonContent = """
{
"api": "https://api.example.com/v1",
"docs": "https://docs.example.com",
"links": ["https://github.com/example", "https://twitter.com/example"]
}
"""
// jsonContent: String = """
// {
// "api": "https://api.example.com/v1",
// "docs": "https://docs.example.com",
// "links": ["https://github.com/example", "https://twitter.com/example"]
// }
// """
val jsonUrls = jsonDetector.extract(jsonContent)
// jsonUrls: Set[AbsoluteUrl] = Set(
// AbsoluteUrl(
// scheme = "https",
// authority = Authority(
// userInfo = None,
// host = DomainName(value = "api.example.com"),
// port = None
// ),
// path = AbsolutePath(parts = Vector("v1")),
// query = QueryString(params = Vector()),
// fragment = None
// ),
// AbsoluteUrl(
// scheme = "https",
// authority = Authority(
// userInfo = None,
// host = DomainName(value = "docs.example.com"),
// port = None
// ),
// path = ,
// query = QueryString(params = Vector()),
// fragment = None
// ),
// AbsoluteUrl(
// scheme = "https",
// authority = Authority(
// userInfo = None,
// host = DomainName(value = "github.com"),
// port = None
// ),
// path = AbsolutePath(parts = Vector("example")),
// query = QueryString(params = Vector()),
// fragment = None
// ),
// AbsoluteUrl(
// scheme = "https",
// authority = Authority(
// userInfo = None,
// host = DomainName(value = "twitter.com"),
// port = None
// ),
// path = AbsolutePath(parts = Vector("example")),
// query = QueryString(params = Vector()),
// fragment = None
// )
// )

jsonUrls.foreach(println)
// https://api.example.com/v1
// https://docs.example.com
// https://github.com/example
// https://twitter.com/example

HTML Content

Extracting URLs from HTML:

val htmlDetector = UrlDetector(UrlDetectorOptions.Html)
// htmlDetector: UrlDetector = io.lambdaworks.detection.UrlDetector@ee18d1e
val htmlContent = """
<html>
<a href="https://example.com">Link</a>
<img src="https://cdn.example.com/image.png">
<script>fetch("https://api.example.com/data")</script>
</html>
"""
// htmlContent: String = """
// <html>
// <a href="https://example.com">Link</a>
// <img src="https://cdn.example.com/image.png">
// <script>fetch("https://api.example.com/data")</script>
// </html>
// """
val htmlUrls = htmlDetector.extract(htmlContent)
// htmlUrls: Set[AbsoluteUrl] = Set(
// AbsoluteUrl(
// scheme = "https",
// authority = Authority(
// userInfo = None,
// host = DomainName(value = "example.com"),
// port = None
// ),
// path = ,
// query = QueryString(params = Vector()),
// fragment = None
// ),
// AbsoluteUrl(
// scheme = "https",
// authority = Authority(
// userInfo = None,
// host = DomainName(value = "cdn.example.com"),
// port = None
// ),
// path = AbsolutePath(parts = Vector("image.png")),
// query = QueryString(params = Vector()),
// fragment = None
// ),
// AbsoluteUrl(
// scheme = "https",
// authority = Authority(
// userInfo = None,
// host = DomainName(value = "api.example.com"),
// port = None
// ),
// path = AbsolutePath(parts = Vector("data")),
// query = QueryString(params = Vector()),
// fragment = None
// )
// )

htmlUrls.foreach(println)
// https://example.com
// https://cdn.example.com/image.png
// https://api.example.com/data

JavaScript Code

Extracting URLs from JavaScript:

val jsDetector = UrlDetector(UrlDetectorOptions.Javascript)
// jsDetector: UrlDetector = io.lambdaworks.detection.UrlDetector@227b35c5
val jsCode = """
const API_URL = 'https://api.example.com';
const CDN = "https://cdn.example.com";
fetch('https://data.example.com/users')
.then(res => res.json())
"""
// jsCode: String = """
// const API_URL = 'https://api.example.com';
// const CDN = "https://cdn.example.com";
// fetch('https://data.example.com/users')
// .then(res => res.json())
// """
val jsUrls = jsDetector.extract(jsCode)
// jsUrls: Set[AbsoluteUrl] = Set(
// AbsoluteUrl(
// scheme = "https",
// authority = Authority(
// userInfo = None,
// host = DomainName(value = "api.example.com"),
// port = None
// ),
// path = ,
// query = QueryString(params = Vector()),
// fragment = None
// ),
// AbsoluteUrl(
// scheme = "https",
// authority = Authority(
// userInfo = None,
// host = DomainName(value = "cdn.example.com"),
// port = None
// ),
// path = ,
// query = QueryString(params = Vector()),
// fragment = None
// ),
// AbsoluteUrl(
// scheme = "https",
// authority = Authority(
// userInfo = None,
// host = DomainName(value = "data.example.com"),
// port = None
// ),
// path = AbsolutePath(parts = Vector("users")),
// query = QueryString(params = Vector()),
// fragment = None
// )
// )

jsUrls.foreach(println)
// https://api.example.com
// https://cdn.example.com
// https://data.example.com/users

XML Content

Extracting URLs from XML:

val xmlDetector = UrlDetector(UrlDetectorOptions.Xml)
// xmlDetector: UrlDetector = io.lambdaworks.detection.UrlDetector@2aaee1ae
val xmlContent = """<?xml version="1.0"?>
<config>
<endpoint>https://api.example.com</endpoint>
<resource href="https://cdn.example.com/data"/>
</config>
"""
// xmlContent: String = """<?xml version="1.0"?>
// <config>
// <endpoint>https://api.example.com</endpoint>
// <resource href="https://cdn.example.com/data"/>
// </config>
// """
val xmlUrls = xmlDetector.extract(xmlContent)
// xmlUrls: Set[AbsoluteUrl] = Set(
// AbsoluteUrl(
// scheme = "https",
// authority = Authority(
// userInfo = None,
// host = DomainName(value = "api.example.com"),
// port = None
// ),
// path = ,
// query = QueryString(params = Vector()),
// fragment = None
// ),
// AbsoluteUrl(
// scheme = "https",
// authority = Authority(
// userInfo = None,
// host = DomainName(value = "cdn.example.com"),
// port = None
// ),
// path = AbsolutePath(parts = Vector("data")),
// query = QueryString(params = Vector()),
// fragment = None
// )
// )

xmlUrls.foreach(println)
// https://api.example.com
// https://cdn.example.com/data

Advanced Scenarios

URLs in Brackets

Detecting URLs wrapped in square brackets, parentheses, or curly braces:

val bracketDetector = UrlDetector(UrlDetectorOptions.BracketMatch)
// bracketDetector: UrlDetector = io.lambdaworks.detection.UrlDetector@4e225637
val markdownText = """
See [https://docs.example.com] for documentation.
Also check (https://github.com/example) and {https://support.example.com}
"""
// markdownText: String = """
// See [https://docs.example.com] for documentation.
// Also check (https://github.com/example) and {https://support.example.com}
// """
val bracketUrls = bracketDetector.extract(markdownText)
// bracketUrls: Set[AbsoluteUrl] = Set(
// AbsoluteUrl(
// scheme = "https",
// authority = Authority(
// userInfo = None,
// host = DomainName(value = "docs.example.com"),
// port = None
// ),
// path = ,
// query = QueryString(params = Vector()),
// fragment = None
// ),
// AbsoluteUrl(
// scheme = "https",
// authority = Authority(
// userInfo = None,
// host = DomainName(value = "github.com"),
// port = None
// ),
// path = AbsolutePath(parts = Vector("example")),
// query = QueryString(params = Vector()),
// fragment = None
// ),
// AbsoluteUrl(
// scheme = "https",
// authority = Authority(
// userInfo = None,
// host = DomainName(value = "support.example.com"),
// port = None
// ),
// path = ,
// query = QueryString(params = Vector()),
// fragment = None
// )
// )

bracketUrls.foreach(println)
// https://docs.example.com
// https://github.com/example
// https://support.example.com

Single-Level Domains

Allowing single-level hosts such as localhost, internal host names, and go links:

val localDetector = UrlDetector(UrlDetectorOptions.AllowSingleLevelDomain)
// localDetector: UrlDetector = io.lambdaworks.detection.UrlDetector@57fcca6d
val devText = """
Development: http://localhost:8080
Internal: http://intranet
Go link: go/documentation
"""
// devText: String = """
// Development: http://localhost:8080
// Internal: http://intranet
// Go link: go/documentation
// """
val localUrls = localDetector.extract(devText)
// localUrls: Set[AbsoluteUrl] = Set(
// AbsoluteUrl(
// scheme = "http",
// authority = Authority(
// userInfo = None,
// host = DomainName(value = "localhost"),
// port = Some(value = 8080)
// ),
// path = ,
// query = QueryString(params = Vector()),
// fragment = None
// ),
// AbsoluteUrl(
// scheme = "http",
// authority = Authority(
// userInfo = None,
// host = DomainName(value = "intranet"),
// port = None
// ),
// path = ,
// query = QueryString(params = Vector()),
// fragment = None
// ),
// AbsoluteUrl(
// scheme = "http",
// authority = Authority(
// userInfo = None,
// host = DomainName(value = "go"),
// port = None
// ),
// path = AbsolutePath(parts = Vector("documentation")),
// query = QueryString(params = Vector()),
// fragment = None
// )
// )

localUrls.foreach(println)
// http://localhost:8080
// http://intranet
// http://go/documentation

Combining Options with Filtering

Using a specific detection mode together with host filtering:

val combinedDetector = UrlDetector(UrlDetectorOptions.Html)
.withAllowed(Host.parse("cdn.example.com"), Host.parse("api.example.com"))
.withDenied(Host.parse("ads.cdn.example.com"))
// combinedDetector: UrlDetector = io.lambdaworks.detection.UrlDetector@5ca34a5b

val complexHtml = """
<html>
<link href="https://cdn.example.com/styles.css">
<script src="https://ads.cdn.example.com/tracker.js"></script>
<img src="https://api.example.com/images/logo.png">
<a href="https://other.example.com">External</a>
</html>
"""
// complexHtml: String = """
// <html>
// <link href="https://cdn.example.com/styles.css">
// <script src="https://ads.cdn.example.com/tracker.js"></script>
// <img src="https://api.example.com/images/logo.png">
// <a href="https://other.example.com">External</a>
// </html>
// """
val combinedUrls = combinedDetector.extract(complexHtml)
// combinedUrls: Set[AbsoluteUrl] = Set(
// AbsoluteUrl(
// scheme = "https",
// authority = Authority(
// userInfo = None,
// host = DomainName(value = "cdn.example.com"),
// port = None
// ),
// path = AbsolutePath(parts = Vector("styles.css")),
// query = QueryString(params = Vector()),
// fragment = None
// ),
// AbsoluteUrl(
// scheme = "https",
// authority = Authority(
// userInfo = None,
// host = DomainName(value = "api.example.com"),
// port = None
// ),
// path = AbsolutePath(parts = Vector("images", "logo.png")),
// query = QueryString(params = Vector()),
// fragment = None
// )
// )

// Returns cdn.example.com and api.example.com, but excludes ads.cdn.example.com and other.example.com
combinedUrls.foreach(println)
// https://cdn.example.com/styles.css
// https://api.example.com/images/logo.png

Working with Extracted URLs

Accessing URL Components

The extracted URLs are AbsoluteUrl instances from scala-uri:

val urlComponentsDetector = UrlDetector.default
// urlComponentsDetector: UrlDetector = io.lambdaworks.detection.UrlDetector@2152141
val componentUrls = urlComponentsDetector.extract("https://api.example.com:8080/v1/users?page=1#top")
// componentUrls: Set[AbsoluteUrl] = Set(
// AbsoluteUrl(
// scheme = "https",
// authority = Authority(
// userInfo = None,
// host = DomainName(value = "api.example.com"),
// port = Some(value = 8080)
// ),
// path = AbsolutePath(parts = Vector("v1", "users")),
// query = QueryString(params = Vector(("page", Some(value = "1")))),
// fragment = Some(value = "top")
// )
// )

componentUrls.foreach { url =>
println(s"Scheme: ${url.schemeOption}")
println(s"Host: ${url.host}")
println(s"Port: ${url.port}")
println(s"Path: ${url.path}")
println(s"Query: ${url.query}")
println(s"Fragment: ${url.fragment}")
}
// Scheme: Some(https)
// Host: api.example.com
// Port: Some(8080)
// Path: /v1/users
// Query: page=1
// Fragment: Some(top)

Filtering by Scheme

Filter extracted URLs by scheme:

val schemeUrls = UrlDetector.default.extract("Visit https://secure.example.com and http://legacy.example.com")
// schemeUrls: Set[AbsoluteUrl] = Set(
// AbsoluteUrl(
// scheme = "https",
// authority = Authority(
// userInfo = None,
// host = DomainName(value = "secure.example.com"),
// port = None
// ),
// path = ,
// query = QueryString(params = Vector()),
// fragment = None
// ),
// AbsoluteUrl(
// scheme = "http",
// authority = Authority(
// userInfo = None,
// host = DomainName(value = "legacy.example.com"),
// port = None
// ),
// path = ,
// query = QueryString(params = Vector()),
// fragment = None
// )
// )
val httpsOnly = schemeUrls.filter(_.schemeOption.contains("https"))
// httpsOnly: Set[AbsoluteUrl] = Set(
// AbsoluteUrl(
// scheme = "https",
// authority = Authority(
// userInfo = None,
// host = DomainName(value = "secure.example.com"),
// port = None
// ),
// path = ,
// query = QueryString(params = Vector()),
// fragment = None
// )
// )

httpsOnly.foreach(println)
// https://secure.example.com

Grouping by Host

Group URLs by their host:

val multiSiteText = """
Check https://github.com/user1, https://github.com/user2,
https://gitlab.com/project, and https://bitbucket.org/repo
"""
// multiSiteText: String = """
// Check https://github.com/user1, https://github.com/user2,
// https://gitlab.com/project, and https://bitbucket.org/repo
// """
val groupedUrls = UrlDetector.default.extract(multiSiteText)
// groupedUrls: Set[AbsoluteUrl] = Set(
// AbsoluteUrl(
// scheme = "https",
// authority = Authority(
// userInfo = None,
// host = DomainName(value = "github.com"),
// port = None
// ),
// path = AbsolutePath(parts = Vector("user1")),
// query = QueryString(params = Vector()),
// fragment = None
// ),
// AbsoluteUrl(
// scheme = "https",
// authority = Authority(
// userInfo = None,
// host = DomainName(value = "github.com"),
// port = None
// ),
// path = AbsolutePath(parts = Vector("user2")),
// query = QueryString(params = Vector()),
// fragment = None
// ),
// AbsoluteUrl(
// scheme = "https",
// authority = Authority(
// userInfo = None,
// host = DomainName(value = "gitlab.com"),
// port = None
// ),
// path = AbsolutePath(parts = Vector("project")),
// query = QueryString(params = Vector()),
// fragment = None
// ),
// AbsoluteUrl(
// scheme = "https",
// authority = Authority(
// userInfo = None,
// host = DomainName(value = "bitbucket.org"),
// port = None
// ),
// path = AbsolutePath(parts = Vector("repo")),
// query = QueryString(params = Vector()),
// fragment = None
// )
// )
val byHost = groupedUrls.groupBy(_.host)
// byHost: Map[Host, Set[AbsoluteUrl]] = Map(
// DomainName(value = "gitlab.com") -> Set(
// AbsoluteUrl(
// scheme = "https",
// authority = Authority(
// userInfo = None,
// host = DomainName(value = "gitlab.com"),
// port = None
// ),
// path = AbsolutePath(parts = Vector("project")),
// query = QueryString(params = Vector()),
// fragment = None
// )
// ),
// DomainName(value = "bitbucket.org") -> Set(
// AbsoluteUrl(
// scheme = "https",
// authority = Authority(
// userInfo = None,
// host = DomainName(value = "bitbucket.org"),
// port = None
// ),
// path = AbsolutePath(parts = Vector("repo")),
// query = QueryString(params = Vector()),
// fragment = None
// )
// ),
// DomainName(value = "github.com") -> Set(
// AbsoluteUrl(
// scheme = "https",
// authority = Authority(
// userInfo = None,
// host = DomainName(value = "github.com"),
// port = None
// ),
// path = AbsolutePath(parts = Vector("user1")),
// query = QueryString(params = Vector()),
// fragment = None
// ),
// AbsoluteUrl(
// scheme = "https",
// authority = Authority(
// userInfo = None,
// host = DomainName(value = "github.com"),
// port = None
// ),
// path = AbsolutePath(parts = Vector("user2")),
// query = QueryString(params = Vector()),
// fragment = None
// ...

byHost.foreach { case (host, urls) =>
println(s"$host: ${urls.size} URLs")
}
// gitlab.com: 1 URLs
// bitbucket.org: 1 URLs
// github.com: 2 URLs