aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorDaniel Schürmann <d.schuermann@2718282.net>2024-03-10 12:22:13 +0100
committerDaniel Schürmann <d.schuermann@2718282.net>2024-03-10 12:22:13 +0100
commite02f41b9b1dc3c45f6626e8b01fee2acb5b905d4 (patch)
tree8fa7087b5f5fcfe5513c97ef7fa6f0fdb9edf614
downloadKOR.addrlink-390f76d388fa183fdf5448219525b79cefe909f6.tar.gz
KOR.addrlink-390f76d388fa183fdf5448219525b79cefe909f6.zip
First cran versionHEADv1.0.1master
-rw-r--r--KOR.addrlink/DESCRIPTION21
-rw-r--r--KOR.addrlink/NAMESPACE4
-rw-r--r--KOR.addrlink/R/addrlink.R111
-rw-r--r--KOR.addrlink/R/helper_split_address.R33
-rw-r--r--KOR.addrlink/R/helper_split_number.R25
-rw-r--r--KOR.addrlink/R/l1score.R8
-rw-r--r--KOR.addrlink/R/match_number.R52
-rw-r--r--KOR.addrlink/R/sanitize_street.R15
-rw-r--r--KOR.addrlink/R/split_address.R10
-rw-r--r--KOR.addrlink/R/split_number.R16
-rw-r--r--KOR.addrlink/data/Adressen.RDatabin0 -> 717568 bytes
-rw-r--r--KOR.addrlink/data/df1.RDatabin0 -> 57256 bytes
-rw-r--r--KOR.addrlink/data/df2.RDatabin0 -> 25164 bytes
-rw-r--r--KOR.addrlink/man/Adressen.Rd21
-rw-r--r--KOR.addrlink/man/KOR.addrlink-package.Rd15
-rw-r--r--KOR.addrlink/man/addrlink.Rd47
-rw-r--r--KOR.addrlink/man/df1.Rd18
-rw-r--r--KOR.addrlink/man/df2.Rd17
-rw-r--r--KOR.addrlink/man/helper_split_address.Rd18
-rw-r--r--KOR.addrlink/man/helper_split_number.Rd18
-rw-r--r--KOR.addrlink/man/l1score.Rd18
-rw-r--r--KOR.addrlink/man/match_number.Rd39
-rw-r--r--KOR.addrlink/man/sanitize_street.Rd28
-rw-r--r--KOR.addrlink/man/split_address.Rd28
-rw-r--r--KOR.addrlink/man/split_number.Rd27
-rw-r--r--KOR.addrlink/tests/Examples/KOR.addrlink-Ex.Rout.save81
-rw-r--r--KOR.addrlink/tests/test_l1score.R1
-rw-r--r--KOR.addrlink/tests/test_l1score.Rout.save23
-rw-r--r--KOR.addrlink/tests/test_sanitize_street.R1
-rw-r--r--KOR.addrlink/tests/test_sanitize_street.Rout.save23
-rw-r--r--KOR.addrlink/tests/test_split_address.R2
-rw-r--r--KOR.addrlink/tests/test_split_address.Rout.save29
-rw-r--r--KOR.addrlink/tests/test_split_number.R1
-rw-r--r--KOR.addrlink/tests/test_split_number.Rout.save29
-rw-r--r--KOR.addrlink/vignettes/Example.Rnw143
-rw-r--r--README.html18
36 files changed, 940 insertions, 0 deletions
diff --git a/KOR.addrlink/DESCRIPTION b/KOR.addrlink/DESCRIPTION
new file mode 100644
index 0000000..8761e79
--- /dev/null
+++ b/KOR.addrlink/DESCRIPTION
@@ -0,0 +1,21 @@
+Package: KOR.addrlink
+Type: Package
+Title: Matching Address Data to Reference Index
+Version: 1.0.1
+Date: 2024-03-02
+Author: Daniel Schürmann [aut, cre]
+Authors@R: person("Daniel", "Schürmann", role = c("aut", "cre"), email = "d.schuermann@2718282.net")
+Maintainer: Daniel Schürmann <d.schuermann@2718282.net>
+Depends: R (>= 3.4)
+Imports: stringdist, stringi
+LazyData: true
+Description: Matches a data set with semi-structured address data,
+ e.g., street and house number as a concatenated string,
+ wrongly spelled street names or non-existing house numbers to a
+ reference index. The methods are specifically designed for German
+ municipalities ('KOR'-community) and German address schemes.
+License: GPL-3
+Encoding: UTF-8
+URL: https://git-kor.stadtdo.de
+BugReports: https://git-kor.stadtdo.de/stadt-dortmund/adressdaten/-/issues
+
diff --git a/KOR.addrlink/NAMESPACE b/KOR.addrlink/NAMESPACE
new file mode 100644
index 0000000..6a6a065
--- /dev/null
+++ b/KOR.addrlink/NAMESPACE
@@ -0,0 +1,4 @@
+export("addrlink", "split_address", "split_number")
+importFrom("utils", "head")
+importFrom("stringi", "stri_replace_all_regex", "stri_trans_general", "stri_trans_nfc")
+importFrom("stringdist", "stringsimmatrix")
diff --git a/KOR.addrlink/R/addrlink.R b/KOR.addrlink/R/addrlink.R
new file mode 100644
index 0000000..49deec8
--- /dev/null
+++ b/KOR.addrlink/R/addrlink.R
@@ -0,0 +1,111 @@
+# Link every record of `df_match` to a row of the reference index `df_ref`.
+#
+# Arguments:
+#   df_ref, df_match   data.frames with reference addresses / addresses to match
+#   col_ref, col_match character(3) naming the street, house number and
+#                      additional letter columns (in that order)
+#   fuzzy_threshold    similarity in (0, 1) a street name has to exceed to be
+#                      accepted in the fuzzy pass
+#   seed               positive seed; set.seed() is called with it because
+#                      match_number() may draw random house numbers
+#
+# Value: list(ret = rows of df_match bound to the matched rows of df_ref,
+#             QA  = data.frame with qAddress (1 exact, 2 street exact,
+#                   3 street fuzzy, 4 no match) and qscore)
+addrlink <-
+function(df_ref, df_match,
+         col_ref = c("Strasse", "Hausnummer", "Hausnummernzusatz"),
+         col_match = c("Strasse", "Hausnummer", "Hausnummernzusatz"),
+         fuzzy_threshold = .9, seed = 1234){
+
+  key <- c("Strasse", "Hausnummer", "Hausnummernzusatz")
+
+  stopifnot(is.data.frame(df_ref), is.data.frame(df_match),
+            is.vector(col_ref), is.character(col_ref),
+            is.vector(col_match), is.character(col_match))
+  # anyDuplicated() avoids comparing two vectors of different length, which
+  # is what unique(names) == names does as soon as a name is duplicated
+  stopifnot(anyDuplicated(colnames(df_ref)) == 0)
+  stopifnot(anyDuplicated(colnames(df_match)) == 0)
+  # every column name has to be present, not just one of them
+  stopifnot(all(!is.na(col_ref), !is.na(col_match)))
+  stopifnot(length(col_ref) == 3, length(col_match) == 3)
+  stopifnot(col_ref %in% colnames(df_ref))
+  stopifnot(col_match %in% colnames(df_match))
+  stopifnot(fuzzy_threshold < 1, fuzzy_threshold > 0)
+  stopifnot(seed > 0)
+  set.seed(seed)
+
+  # Placeholder for a pass without hits. stringsAsFactors = FALSE keeps the
+  # character columns character on R < 4.0 as well.
+  empty_pass <- function(){
+    data.frame(Strasse = character(), Hausnummer = numeric(),
+               Hausnummernzusatz = character(), id.addr = numeric(),
+               id.df = numeric(), qAddress = numeric(), qscore = numeric(),
+               stringsAsFactors = FALSE)
+  }
+
+  # Runs match_number() on every row of `recs` (rows are handed over as named
+  # character vectors by apply()) and binds the results to their id.df.
+  match_in_street <- function(recs){
+    cbind(id.df = recs$id.df,
+          do.call(rbind, apply(X = recs, MARGIN = 1,
+                               FUN = match_number, Adressen = Adressen)))
+  }
+
+  Adressen <- df_ref[, col_ref]
+  stopifnot(is.character(Adressen[,1]), is.numeric(Adressen[,2]), is.character(Adressen[,3]))
+  stopifnot(nrow(unique(Adressen)) == nrow(Adressen))
+  stopifnot(nrow(Adressen) > 0)
+  colnames(Adressen) <- key
+  Adressen$id.addr <- seq_len(nrow(Adressen))
+  Adressen$Strasse <- sanitize_street(Adressen$Strasse)
+  Adressen$Hausnummernzusatz <- tolower(Adressen$Hausnummernzusatz)
+
+  df <- df_match[, col_match]
+  stopifnot(is.character(df[,1]), is.numeric(df[,2]),
+            is.character(df[,3]))
+  stopifnot(nrow(df) > 0)
+  colnames(df) <- key
+  df$id.df <- seq_len(nrow(df))
+  df$Strasse <- sanitize_street(df$Strasse)
+  df$Hausnummernzusatz <- tolower(df$Hausnummernzusatz)
+
+  # first pass (direct matches)
+  fp <- merge(x = df, y = Adressen, by = key, incomparables = NULL)
+  if(nrow(fp) > 0){
+    fp$qAddress <- 1
+    fp$qscore <- 1
+  } else fp <- empty_pass()
+
+  # second pass (street correct)
+  rest <- df[!(df$id.df %in% fp$id.df) & !is.na(df$Strasse),]
+  sp <- merge(x = rest, y = unique(Adressen[, "Strasse", drop = FALSE]),
+              by = "Strasse", incomparables = NULL)
+  if(nrow(sp) > 0){
+    sp <- match_in_street(sp)
+    sp$qAddress <- 2
+    sp <- merge(x = sp, y = Adressen, by = key, incomparables = NULL)
+  } else sp <- empty_pass()
+
+  ## third pass (fuzzy matches)
+  tp <- df[!(df$id.df %in% c(fp$id.df, sp$id.df)) & !is.na(df$Strasse),]
+  uSTR <- unique(Adressen$Strasse)
+  # detectCores() may return NA; fall back to a single thread in that case
+  n_cores <- parallel::detectCores()
+  if(is.na(n_cores)) n_cores <- 1
+  sim <- stringdist::stringsimmatrix(a = tp$Strasse, b = uSTR, method = "jw",
+                                     nthread = max(1, floor(n_cores / 2)))
+  best_sim <- apply(sim, MARGIN = 1, FUN = max)
+  threshold <- which(best_sim > fuzzy_threshold)
+  if(length(threshold) > 0){
+    tp$Strasse[threshold] <- uSTR[apply(sim[threshold, , drop = FALSE],
+                                        MARGIN = 1, FUN = which.max)]
+    # merge() below re-sorts the rows by street, so the similarity has to
+    # travel with its row instead of being multiplied in by position later
+    tp$fuzzy <- best_sim
+    tp <- tp[threshold, , drop = FALSE]
+    tp <- merge(x = tp, y = unique(Adressen[, "Strasse", drop = FALSE]),
+                by = "Strasse", incomparables = NULL)
+    fuzzy <- tp$fuzzy
+    tp$fuzzy <- NULL
+    tp <- match_in_street(tp)
+    tp$qscore <- tp$qscore * fuzzy
+    tp$qAddress <- 3
+    tp <- merge(x = tp, y = Adressen, by = key, incomparables = NULL)
+  } else tp <- empty_pass()
+
+  # no match
+  nomatch <- df[!(df$id.df %in% c(fp$id.df, sp$id.df, tp$id.df)),]
+  if(nrow(nomatch) > 0){
+    nomatch$Strasse <- NA
+    nomatch$Hausnummer <- NA
+    nomatch$Hausnummernzusatz <- NA
+    nomatch$id.addr <- NA
+    nomatch$qAddress <- 4
+    nomatch$qscore <- 0
+  } else nomatch <- empty_pass()
+
+  # results (rbind matches the columns of the four passes by name)
+  res <- rbind(fp, sp, tp, nomatch)
+  ret <- cbind(df_match[res$id.df,], df_ref[res$id.addr,])
+  return(list(ret = ret, QA = res[, c("qAddress", "qscore")]))
+}
diff --git a/KOR.addrlink/R/helper_split_address.R b/KOR.addrlink/R/helper_split_address.R
new file mode 100644
index 0000000..e750ae8
--- /dev/null
+++ b/KOR.addrlink/R/helper_split_address.R
@@ -0,0 +1,33 @@
+# Split ONE address string into street, house number and additional letter.
+#
+# The house number is searched from the right: the last digit of the string
+# ends the number, and the number is extended to the left as long as the
+# characters are digits or separators (" ", "-", "/") enclosed by digits.
+# For ranges ("8-9", "8/9-10") the first number of that block is returned.
+#
+# Arguments:
+#   x      character(1)
+#   debug  if TRUE, x is printed first (helps to locate a failing record)
+# Value: list(strasse, hnr, hnrz); hnr and hnrz are NA when not available.
+helper_split_address <-
+function(x, debug = FALSE){
+  if(debug) print(x)
+  x <- trimws(x)
+  chars <- unlist(strsplit(x, ''))
+  digit_pos <- which(chars %in% as.character(0:9))
+  if(length(digit_pos) == 0){ return(list(strasse = x, hnr = NA, hnrz = NA)) }
+  last_digit <- max(digit_pos)
+
+  # separators only count as part of the number when digits surround them
+  sep_pos <- which(chars %in% c(" ", "-", "/"))
+  sep_pos <- sep_pos[sep_pos > min(digit_pos) & sep_pos < last_digit]
+  number_pos <- c(digit_pos, sep_pos)
+
+  # walk left from the last digit over the contiguous number block
+  start <- last_digit
+  while(start > 1 && (start - 1) %in% number_pos) start <- start - 1
+
+  # The block holds digits and separators only; empty pieces (e.g. from a
+  # leading "-") become NA without a warning and are skipped.
+  pieces <- strsplit(trimws(substr(x, start, last_digit)), "[-/ ]")[[1]]
+  numbers <- as.numeric(pieces)
+  hnr <- numbers[!is.na(numbers)][1]
+
+  # only a letter qualifies as additional letter, punctuation does not
+  hnrz <- toupper(substr(trimws(substr(x, last_digit + 1, nchar(x))), 1, 1))
+  if(!grepl("^[[:alpha:]]$", hnrz)) hnrz <- NA
+
+  strasse <- trimws(substr(x, 1, start - 1))
+  # drop a leftover "100a" of a range like "100a-102c" and the blank before it
+  strasse <- trimws(sub("[[:digit:]]+[a-zA-Z]$", "", strasse))
+  return(list(strasse = strasse, hnr = hnr, hnrz = hnrz))
+}
diff --git a/KOR.addrlink/R/helper_split_number.R b/KOR.addrlink/R/helper_split_number.R
new file mode 100644
index 0000000..c19e89d
--- /dev/null
+++ b/KOR.addrlink/R/helper_split_number.R
@@ -0,0 +1,25 @@
+# Split ONE house number string into house number and additional letter.
+#
+# "-", "/" and runs of whitespace are treated as blanks; everything before
+# the first digit is dropped and only the first blank-delimited token is
+# evaluated ("100a-102c" gives 100 / "a", "8-9 a" gives 8 / NA).
+#
+# Arguments:
+#   x      character(1)
+#   debug  if TRUE, x is printed first (helps to locate a failing record)
+# Value: one-row data.frame(Hausnummer, Zusatz); both NA when x is NA, empty
+#        or contains no digit.
+helper_split_number <-
+function(x, debug = FALSE){
+  if(debug) print(x)
+  nothing <- data.frame(Hausnummer = NA, Zusatz = NA)
+  x <- stringi::stri_replace_all_regex(str = x,
+         pattern = c("-", "/", "\\s+"), replace = c(" ", " ", " "),
+         vectorize_all = FALSE)
+  x <- trimws(x)
+  # nchar(NA) is NA, so NA has to be caught before the length test
+  if(is.na(x) || nchar(x) == 0){ return(nothing) }
+
+  first_digit <- regexpr("[0-9]", x)
+  # without any digit there is no house number to extract
+  if(first_digit == -1){ return(nothing) }
+  token <- strsplit(substr(x, first_digit, nchar(x)), ' ')[[1]][1]
+
+  # token starts with a digit, so the leading digit run is never empty
+  digits <- regmatches(token, regexpr("^[0-9]+", token))
+  hausnr <- as.numeric(digits)
+  zusatz <- substr(token, nchar(digits) + 1, nchar(digits) + 1)
+  if(zusatz == ""){ zusatz <- NA }
+  return(data.frame(Hausnummer = hausnr, Zusatz = zusatz,
+                    stringsAsFactors = FALSE))
+}
diff --git a/KOR.addrlink/R/l1score.R b/KOR.addrlink/R/l1score.R
new file mode 100644
index 0000000..86154ed
--- /dev/null
+++ b/KOR.addrlink/R/l1score.R
@@ -0,0 +1,8 @@
+# Score in [0, 1] that decreases with the absolute value of x:
+# 1 - |x| / max(1, |x|), the maximum being taken over the whole vector.
+# NA entries stay NA; a vector consisting of NA only is scored 1 throughout.
+l1score <-
+function(x){
+  stopifnot(is.vector(x) & is.numeric(x))
+  if(all(is.na(x))){
+    return(rep(1, length(x)))
+  }
+  magnitude <- abs(x)
+  largest <- max(c(1, magnitude), na.rm = TRUE)
+  1 - magnitude / largest
+}
diff --git a/KOR.addrlink/R/match_number.R b/KOR.addrlink/R/match_number.R
new file mode 100644
index 0000000..34e2ade
--- /dev/null
+++ b/KOR.addrlink/R/match_number.R
@@ -0,0 +1,52 @@
+# Find the best house number / additional letter within the street of `record`.
+#
+# Arguments:
+#   record    one address (Strasse, Hausnummer, Hausnummernzusatz), accessed
+#             with [[ ]]; addrlink() passes a named character vector
+#   Adressen  data.frame of all valid addresses (same three columns)
+#   weights   weights of the house number score and the letter score
+#
+# Value: one-row data.frame(qscore, Strasse, Hausnummer, Hausnummernzusatz)
+#
+# Strategy:
+#   * neither number nor letter given: random address of the street, qscore 0
+#   * number missing, or no valid number within a distance of 4: the address
+#     with the closest letter, qscore 0.05
+#   * number only: closest number (random if none is within a distance of 4)
+#   * number and letter: best weighted sum of number and letter score
+# Ties are broken at random.
+match_number <-
+function(record, Adressen, weights = c(.9, .1)){
+  street <- record[["Strasse"]]
+  valid <- Adressen[Adressen$Strasse == street, c("Hausnummer", "Hausnummernzusatz")]
+
+  # random address of the street; seq_len() makes a street without any valid
+  # address fail in sample() instead of silently sampling from c(1, 0)
+  random_pick <- function(){
+    cbind(qscore = 0, Strasse = street,
+          valid[sample(seq_len(nrow(valid)), 1),])
+  }
+  # one of the candidate rows `ids`; a single candidate is taken directly
+  # because sample(n, 1) would draw from 1:n
+  pick <- function(ids, qscore){
+    row <- if(length(ids) == 1) ids else sample(ids, 1)
+    cbind(qscore = qscore, Strasse = street, valid[row,])
+  }
+  # Letter distance score. toupper() on both sides makes the comparison
+  # case-insensitive: addrlink() lower-cases all letters, and matching those
+  # against LETTERS directly would yield NA for every single letter.
+  letter_score <- function(){
+    l1score(match(toupper(record[["Hausnummernzusatz"]]), LETTERS) -
+            match(toupper(valid$Hausnummernzusatz), LETTERS)) * weights[2]
+  }
+  # fallback when only the letter can be used
+  by_letter <- function(zusatz){
+    ids <- which(zusatz == max(zusatz, na.rm = TRUE))
+    if(length(ids) == 0) return(random_pick())
+    pick(ids, 0.05)
+  }
+
+  no_hnr <- is.na(record[["Hausnummer"]])
+  no_hnrz <- is.na(record[["Hausnummernzusatz"]])
+
+  if(no_hnr && no_hnrz){ #no info
+    return(random_pick())
+  }
+  if(no_hnr){ #no hnr, but hnrz
+    return(by_letter(letter_score()))
+  }
+
+  hausnr_diff <- as.numeric(record[["Hausnummer"]]) - as.numeric(valid$Hausnummer)
+  hausnr <- l1score(hausnr_diff) * weights[1]
+  too_far <- min(abs(hausnr_diff), na.rm = TRUE) > 4
+
+  if(no_hnrz){ #hnr, no hnrz
+    if(too_far) return(random_pick())
+    val <- max(hausnr, na.rm = TRUE)
+    return(pick(which(hausnr == val), val))
+  }
+
+  zusatz <- letter_score()
+  if(too_far){ #hnr unusable, but hnrz
+    return(by_letter(zusatz))
+  }
+  # addresses without a comparable letter get a small penalty
+  zusatz[is.na(zusatz)] <- - 0.05
+  score <- hausnr + zusatz
+  val <- max(score, na.rm = TRUE)
+  return(pick(which(score == val), val))
+} \ No newline at end of file
diff --git a/KOR.addrlink/R/sanitize_street.R b/KOR.addrlink/R/sanitize_street.R
new file mode 100644
index 0000000..d7ff05f
--- /dev/null
+++ b/KOR.addrlink/R/sanitize_street.R
@@ -0,0 +1,15 @@
+# Normalise street names so that differently spelled variants become equal:
+# lower case, German umlauts and sharp s spelled out, hyphens, blanks and dots
+# removed, a trailing "str" expanded to "strasse", remaining non-ASCII
+# characters transliterated and all punctuation dropped.
+# x: character vector of street names; value: character vector, same length.
+sanitize_street <-
+function(x){
+  stopifnot(is.character(x), is.vector(x))
+  pattern <- c("\u00e4", "\u00fc", "\u00f6", "\u00df", "-", " ", "\\.", "str$")
+  replacement <- c("ae", "ue", "oe", "ss", "", "", "", "strasse")
+  street <- tolower(x)
+  # The rules are applied one after another; "str$" has to come last so that
+  # it sees the name after dots and blanks have been removed.
+  for(i in seq_along(pattern)){
+    street <- stringi::stri_replace_all_regex(str = street,
+                pattern = pattern[i], replace = replacement[i])
+  }
+  street <- stringi::stri_trans_nfc(
+    str = stringi::stri_trans_general(str = street, id = "Any-Latin;Latin-ASCII"))
+  stringi::stri_replace_all_regex(str = street,
+    pattern = "[[:punct:]]", replace = "")
+}
diff --git a/KOR.addrlink/R/split_address.R b/KOR.addrlink/R/split_address.R
new file mode 100644
index 0000000..4fc71d5
--- /dev/null
+++ b/KOR.addrlink/R/split_address.R
@@ -0,0 +1,10 @@
+# Split concatenated addresses into street, house number and additional letter.
+#
+# Arguments:
+#   x      character vector, one address per element
+#   debug  if TRUE, every record is printed before it is processed
+# Value: data.frame with one row per element of x and the columns
+#   Strasse (character), Hausnummer (numeric), Hausnummernzusatz (character).
+# The column types are fixed, also when a column holds nothing but NA or x
+# is empty, so that the result can be passed on to addrlink() directly.
+split_address <-
+function(x, debug = FALSE) {
+  stopifnot(is.character(x), is.vector(x))
+  # unname() keeps names of x from ending up as row names
+  parts <- lapply(unname(x), helper_split_address, debug = debug)
+  data.frame(
+    Strasse = vapply(parts, function(p) as.character(p[[1]]), character(1)),
+    Hausnummer = vapply(parts, function(p) as.numeric(p[[2]]), numeric(1)),
+    Hausnummernzusatz = vapply(parts, function(p) as.character(p[[3]]), character(1)),
+    stringsAsFactors = FALSE)
+}
diff --git a/KOR.addrlink/R/split_number.R b/KOR.addrlink/R/split_number.R
new file mode 100644
index 0000000..4f7e584
--- /dev/null
+++ b/KOR.addrlink/R/split_number.R
@@ -0,0 +1,16 @@
+# Split house number strings into house number and additional letter.
+#
+# Arguments:
+#   x      character vector, one house number (plus optional letter) each
+#   debug  if TRUE, every record passed to the helper is printed first
+# Value: data.frame with one row per element of x and the columns
+#   Hausnummer (numeric) and Hausnummernzusatz (character).
+# "0", "NULL" and empty strings count as missing. Plain numbers are converted
+# directly; only the remaining entries that contain a digit are parsed by
+# helper_split_number(), everything else stays NA.
+split_number <-
+function(x, debug = FALSE){
+  stopifnot(is.character(x), is.vector(x))
+  x <- trimws(x)
+  x[which(x == "0" | x == "NULL" | x == "")] <- NA
+  # NA_character_ keeps the letter column character even if no entry has a
+  # letter; rep() keeps the constructor valid for empty input
+  x_ready <- data.frame(Hausnummer = suppressWarnings(as.numeric(x)),
+                        Hausnummernzusatz = rep(NA_character_, length(x)),
+                        stringsAsFactors = FALSE)
+  ids <- which(!is.na(x) & is.na(x_ready$Hausnummer) & grepl("[0-9]", x))
+  if(length(ids) > 0){
+    parts <- lapply(x[ids], helper_split_number, debug = debug)
+    x_ready$Hausnummer[ids] <-
+      vapply(parts, function(p) as.numeric(p[[1]]), numeric(1))
+    x_ready$Hausnummernzusatz[ids] <-
+      vapply(parts, function(p) as.character(p[[2]]), character(1))
+  }
+  return(x_ready)
+}
diff --git a/KOR.addrlink/data/Adressen.RData b/KOR.addrlink/data/Adressen.RData
new file mode 100644
index 0000000..af08d1e
--- /dev/null
+++ b/KOR.addrlink/data/Adressen.RData
Binary files differ
diff --git a/KOR.addrlink/data/df1.RData b/KOR.addrlink/data/df1.RData
new file mode 100644
index 0000000..032d1e1
--- /dev/null
+++ b/KOR.addrlink/data/df1.RData
Binary files differ
diff --git a/KOR.addrlink/data/df2.RData b/KOR.addrlink/data/df2.RData
new file mode 100644
index 0000000..41f7f4d
--- /dev/null
+++ b/KOR.addrlink/data/df2.RData
Binary files differ
diff --git a/KOR.addrlink/man/Adressen.Rd b/KOR.addrlink/man/Adressen.Rd
new file mode 100644
index 0000000..b04cc94
--- /dev/null
+++ b/KOR.addrlink/man/Adressen.Rd
@@ -0,0 +1,21 @@
+\name{Adressen}
+\docType{data}
+\alias{Adressen}
+\title{Address data from the city of Dortmund}
+\description{
+ This data set gives all the addresses in the city of Dortmund.
+}
+\usage{Adressen}
+\format{A data.frame
+ \tabular{lll}{
+ STRNAME \tab character \tab street name\cr
+ STRSL \tab numeric \tab street number\cr
+ HNR \tab numeric \tab house number\cr
+ HNRZ \tab character \tab additional letter\cr
+ RW \tab numeric \tab longitude \cr
+ HW \tab numeric \tab latitude \cr
+ UBZ \tab numeric \tab subdistrict number
+ }
+}
+\source{\url{https://open-data.dortmund.de}}
+\keyword{datasets} \ No newline at end of file
diff --git a/KOR.addrlink/man/KOR.addrlink-package.Rd b/KOR.addrlink/man/KOR.addrlink-package.Rd
new file mode 100644
index 0000000..9885e03
--- /dev/null
+++ b/KOR.addrlink/man/KOR.addrlink-package.Rd
@@ -0,0 +1,15 @@
+\name{KOR.addrlink-package}
+\alias{KOR.addrlink-package}
+\alias{KOR.addrlink}
+\docType{package}
+\title{KOR.addrlink}
+\description{Geocode address data from German municipalities}
+\details{
+\itemize{
+ \item \code{\link{split_address}} Splits strings into street, house number and additional letter
+ \item \code{\link{split_number}} Splits strings into house number and additional letter
+ \item \code{\link{addrlink}} Matches split address data to reference table
+}
+Matching is based on street name, house number and additional letter.
+}
+\author{Daniel Schürmann}
diff --git a/KOR.addrlink/man/addrlink.Rd b/KOR.addrlink/man/addrlink.Rd
new file mode 100644
index 0000000..099c1cc
--- /dev/null
+++ b/KOR.addrlink/man/addrlink.Rd
@@ -0,0 +1,47 @@
+\name{addrlink}
+\alias{addrlink}
+\title{Merge Data To Reference Index}
+\description{
+Takes two data.frames with address data and merges them together.
+}
+\usage{
+addrlink(df_ref, df_match,
+col_ref = c("Strasse", "Hausnummer", "Hausnummernzusatz"),
+col_match = c("Strasse", "Hausnummer", "Hausnummernzusatz"),
+fuzzy_threshold = 0.9, seed = 1234)
+}
+%- maybe also 'usage' for other objects documented here.
+\arguments{
+ \item{df_ref}{data.frame with address references}
+ \item{df_match}{data.frame with addresses to be matched}
+ \item{col_ref}{character vector of length three, naming the df_ref columns which contain the street names, house numbers and additional letters (in that order)}
+ \item{col_match}{character vector of length three, naming the df_match columns which contain the street names, house numbers and additional letters (in that order)}
+ \item{fuzzy_threshold}{The threshold used for fuzzy matching street names}
+ \item{seed}{Seed for random numbers}
+}
+\details{
+The matching is done in four stages.
+
+\bold{Stage 1} (qAddress = 1). This is an exact match (highest quality, qscore = 1)
+
+\bold{Stage 2} (qAddress = 2). Exact match on street name, but no valid house
+number could be found. Be aware that random house numbers might be used.
+Consider setting your own seed. qscore indicates the match quality.
+See \code{\link{match_number}} for details.
+
+\bold{Stage 3} (qAddress = 3). No exact match on street name could be found.
+Street names are fuzzy matched. The method "jw" (Jaro-Winkler distance) from
+package stringdist is used (see stringdist-metrics). If 1 - [Jaro-Winkler distance]
+is greater than fuzzy_threshold, a match is assumed. The highest score is
+taken and house number matching is done as outlined in Stage 2.
+qscore is fuzzy_score*[house number score].
+
+\bold{Stage 4} (qAddress = 4). No match (qscore = 0)
+}
+\value{
+A list
+ \item{ret}{The merged dataset}
+ \item{QA}{The quality markers (qAddress and qscore)}
+}
+\author{Daniel Schürmann}
+\seealso{\code{\link{split_address}}, \code{\link{split_number}}} \ No newline at end of file
diff --git a/KOR.addrlink/man/df1.Rd b/KOR.addrlink/man/df1.Rd
new file mode 100644
index 0000000..cc56c68
--- /dev/null
+++ b/KOR.addrlink/man/df1.Rd
@@ -0,0 +1,18 @@
+\name{df1}
+\docType{data}
+\alias{df1}
+\title{Example dataset 1}
+\description{
+ This dataset contains separate street and house number information.
+}
+\usage{df1}
+\format{A data.frame
+ \tabular{lll}{
+ gross_strasse \tab character \tab street names\cr
+ hausnr \tab character \tab house number and additional letter\cr
+ Var1 \tab numeric \tab Variable 1\cr
+ Var2 \tab character \tab Variable 2
+ }
+}
+\source{Dortmunder Statistik}
+\keyword{datasets} \ No newline at end of file
diff --git a/KOR.addrlink/man/df2.Rd b/KOR.addrlink/man/df2.Rd
new file mode 100644
index 0000000..0937b62
--- /dev/null
+++ b/KOR.addrlink/man/df2.Rd
@@ -0,0 +1,17 @@
+\name{df2}
+\docType{data}
+\alias{df2}
+\title{Example dataset 2}
+\description{
+ This dataset contains concatenated street and house number information.
+}
+\usage{df2}
+\format{A data.frame
+ \tabular{lll}{
+ Adresse \tab character \tab street name, house number and additional letter\cr
+ Var1 \tab numeric \tab Variable 1\cr
+ Var2 \tab character \tab Variable 2
+ }
+}
+\source{Dortmunder Statistik}
+\keyword{datasets} \ No newline at end of file
diff --git a/KOR.addrlink/man/helper_split_address.Rd b/KOR.addrlink/man/helper_split_address.Rd
new file mode 100644
index 0000000..bc87965
--- /dev/null
+++ b/KOR.addrlink/man/helper_split_address.Rd
@@ -0,0 +1,18 @@
+\name{helper_split_address}
+\alias{helper_split_address}
+\title{Splits A Single Address Into Street, House Number And Additional Letter}
+\description{This is an internal function. Please use \code{\link{split_address}}}
+\usage{helper_split_address(x, debug = FALSE)
+}
+\arguments{
+ \item{x}{A character vector of length 1}
+ \item{debug}{If true, print(x)}
+}
+\value{
+A list with three elements
+ \item{strasse}{Extracted street name}
+ \item{hnr}{Extracted house number}
+ \item{hnrz}{Extracted extra letter}
+}
+\author{Daniel Schürmann}
+\seealso{\code{\link{split_address}}}
diff --git a/KOR.addrlink/man/helper_split_number.Rd b/KOR.addrlink/man/helper_split_number.Rd
new file mode 100644
index 0000000..570504c
--- /dev/null
+++ b/KOR.addrlink/man/helper_split_number.Rd
@@ -0,0 +1,18 @@
+\name{helper_split_number}
+\alias{helper_split_number}
+\title{Splits A Single House Number Into House Number And Additional Letter}
+\description{This is an internal function. Please use \code{\link{split_number}}}
+\usage{helper_split_number(x, debug = FALSE)
+}
+\arguments{
+ \item{x}{A character vector of length 1}
+ \item{debug}{If true, print(x)}
+}
+\value{
+A data.frame with two elements
+ \item{Hausnummer}{Extracted house number}
+ \item{Zusatz}{Extracted extra letter}
+}
+\author{Daniel Schürmann}
+\seealso{\code{\link{split_number}}}
+
diff --git a/KOR.addrlink/man/l1score.Rd b/KOR.addrlink/man/l1score.Rd
new file mode 100644
index 0000000..a60a7f3
--- /dev/null
+++ b/KOR.addrlink/man/l1score.Rd
@@ -0,0 +1,18 @@
+\name{l1score}
+\alias{l1score}
+\title{Calculate L1-Distance Based Scores}
+\description{
+Reversed normalized absolute distance from zero.
+}
+\usage{l1score(x)}
+\arguments{
+ \item{x}{A numeric vector}
+}
+\details{
+\deqn{1 - \frac{|x|}{\text{max}\{1, |x|\}}}{1 - |x| / max(1, |x|)}
+}
+\value{
+A numeric vector of the same length as x
+}
+\author{Daniel Schürmann}
+
diff --git a/KOR.addrlink/man/match_number.Rd b/KOR.addrlink/man/match_number.Rd
new file mode 100644
index 0000000..b25e48c
--- /dev/null
+++ b/KOR.addrlink/man/match_number.Rd
@@ -0,0 +1,39 @@
+\name{match_number}
+\alias{match_number}
+\title{Find Best House Number Match Within Given Street}
+\description{This is an internal function. Please use \code{\link{addrlink}}}
+\usage{match_number(record, Adressen, weights = c(0.9, 0.1))}
+
+\arguments{
+ \item{record}{data.frame with one row and three columns (Strasse, Hausnummer, Hausnummernzusatz)}
+ \item{Adressen}{data.frame of all valid addresses (same columns as record data.frame)}
+ \item{weights}{The weighing factors between house number and additional letter}
+}
+
+\details{
+If no house number and no additional letter is provided, a random address in
+the given street is selected (qscore = 0).
+
+If only an additional letter but no house number is given and the letter is unique,
+returns the corresponding record (qscore = 0.05). Otherwise returns a random one
+as mentioned above (qscore = 0).
+
+If no additional letter, but house number is provided and the maximum distance to
+a valid house number is 4, return the closest match as calculated by
+\code{\link{l1score}} (qscore is the result of l1score). Otherwise a random record
+is returned (qscore = 0).
+
+If additional letter and house number are available and the house number distance
+is at most 4, calculates the l1scores of the house number distance and
+additional letter distance and selects the best match (qscore is the sum of both
+weighted l1scores). Otherwise a random record is selected (qscore = 0).
+}
+\value{
+A data.frame
+ \item{qscore}{The quality score of the match}
+ \item{Strasse}{matched street}
+ \item{Hausnummer}{matched house number}
+ \item{Hausnummernzusatz}{matched additional letter}
+}
+\author{Daniel Schürmann}
+\seealso{\code{\link{addrlink}}} \ No newline at end of file
diff --git a/KOR.addrlink/man/sanitize_street.Rd b/KOR.addrlink/man/sanitize_street.Rd
new file mode 100644
index 0000000..cce0ce5
--- /dev/null
+++ b/KOR.addrlink/man/sanitize_street.Rd
@@ -0,0 +1,28 @@
+\name{sanitize_street}
+\alias{sanitize_street}
+\title{Clean Street Names And Make Them Mergeable}
+\description{
+This function replaces Umlauts, expands "str" to "strasse",
+transliterates all non-ascii characters, removes punctuation and converts
+to lower case.
+}
+\usage{sanitize_street(x)}
+
+\arguments{
+ \item{x}{A character vector containing the street names}
+}
+\details{
+This is an internal function used in \code{addrlink}.
+Make sure house numbers have already been extracted.
+Use \code{split_number} or \code{split_address} for that.
+Only street names can go into \code{sanitize_street}.
+}
+\value{
+A character vector of the same length as x containing the
+sanitized street names. }
+\author{Daniel Schürmann}
+
+\seealso{
+\code{\link{split_address}}, \code{\link{split_number}}, \code{\link{addrlink}}
+}
+
diff --git a/KOR.addrlink/man/split_address.Rd b/KOR.addrlink/man/split_address.Rd
new file mode 100644
index 0000000..91e801d
--- /dev/null
+++ b/KOR.addrlink/man/split_address.Rd
@@ -0,0 +1,28 @@
+\name{split_address}
+\alias{split_address}
+\title{Split Adresses Into Street, House Number And Additional Letter}
+\description{
+This function takes a character vector where each element is made up from a concatenation of
+street name, house number and possibly an additional letter and splits it into its parts.
+}
+\usage{split_address(x, debug = FALSE)}
+\arguments{
+ \item{x}{A character vector}
+ \item{debug}{If true, all records will be printed to the console}
+}
+\details{
+If the function fails, consider using \code{debug = TRUE}. This will print the record, which caused the error.
+Consider filing an issue on the linked git project (see DESCRIPTION).
+}
+\value{
+A data.frame with three columns
+ \item{Strasse}{A character column containing the extracted street names}
+ \item{Hausnummer}{House number}
+ \item{Hausnummernzusatz}{Additional letter}
+}
+\author{Daniel Schürmann}
+\note{For a more advanced, general purpose solution see libpostal.}
+\seealso{\code{\link{split_number}}}
+\examples{
+split_address(c("Teststr. 8-9 a", "Erster Weg 1-2", "Ahornallee 100a-102c"))
+}
diff --git a/KOR.addrlink/man/split_number.Rd b/KOR.addrlink/man/split_number.Rd
new file mode 100644
index 0000000..c8bf7a5
--- /dev/null
+++ b/KOR.addrlink/man/split_number.Rd
@@ -0,0 +1,27 @@
+\name{split_number}
+\alias{split_number}
+\title{Split house number into house number and additional letter}
+\description{
+This function takes a character vector where each element is made up from a concatenation of
+house number and possibly an additional letter and splits it into its parts.
+}
+\usage{split_number(x, debug = FALSE)}
+\arguments{
+ \item{x}{A character vector}
+ \item{debug}{If true, all records will be printed to the console}
+}
+\details{
+If the function fails, consider using \code{debug = TRUE}. This will print the record, which caused the error.
+Consider filing an issue on the linked git project (see DESCRIPTION).
+}
+\value{
+A data.frame with two columns
+ \item{Hausnummer}{House number}
+ \item{Hausnummernzusatz}{Additional letter}
+}
+\author{Daniel Schürmann}
+\note{For a more advanced, general purpose solution see libpostal.}
+\seealso{\code{\link{split_address}}}
+\examples{
+split_number(c("8-9 a", "1-2", "100a-102c"))
+}
diff --git a/KOR.addrlink/tests/Examples/KOR.addrlink-Ex.Rout.save b/KOR.addrlink/tests/Examples/KOR.addrlink-Ex.Rout.save
new file mode 100644
index 0000000..b6c22d2
--- /dev/null
+++ b/KOR.addrlink/tests/Examples/KOR.addrlink-Ex.Rout.save
@@ -0,0 +1,81 @@
+
+R version 4.3.3 (2024-02-29) -- "Angel Food Cake"
+Copyright (C) 2024 The R Foundation for Statistical Computing
+Platform: x86_64-pc-linux-gnu (64-bit)
+
+R is free software and comes with ABSOLUTELY NO WARRANTY.
+You are welcome to redistribute it under certain conditions.
+Type 'license()' or 'licence()' for distribution details.
+
+ Natural language support but running in an English locale
+
+R is a collaborative project with many contributors.
+Type 'contributors()' for more information and
+'citation()' on how to cite R or R packages in publications.
+
+Type 'demo()' for some demos, 'help()' for on-line help, or
+'help.start()' for an HTML browser interface to help.
+Type 'q()' to quit R.
+
+> pkgname <- "KOR.addrlink"
+> source(file.path(R.home("share"), "R", "examples-header.R"))
+> options(warn = 1)
+> library('KOR.addrlink')
+>
+> base::assign(".oldSearch", base::search(), pos = 'CheckExEnv')
+> base::assign(".old_wd", base::getwd(), pos = 'CheckExEnv')
+> cleanEx()
+> nameEx("split_address")
+> ### * split_address
+>
+> flush(stderr()); flush(stdout())
+>
+> ### Name: split_address
+> ### Title: Split Adresses Into Street, House Number And Additional Letter
+> ### Aliases: split_address
+>
+> ### ** Examples
+>
+> split_address(c("Teststr. 8-9 a", "Erster Weg 1-2", "Ahornallee 100a-102c"))
+ Strasse Hausnummer Hausnummernzusatz
+1 Teststr. 8 A
+2 Erster Weg 1 <NA>
+3 Ahornallee 102 C
+>
+>
+>
+> cleanEx()
+> nameEx("split_number")
+> ### * split_number
+>
+> flush(stderr()); flush(stdout())
+>
+> ### Name: split_number
+> ### Title: Split house number into house number and additional letter
+> ### Aliases: split_number
+>
+> ### ** Examples
+>
+> split_number(c("8-9 a", "1-2", "100a-102c"))
+ Hausnummer Hausnummernzusatz
+1 8 <NA>
+2 1 <NA>
+3 100 a
+>
+>
+>
+> ### * <FOOTER>
+> ###
+> cleanEx()
+> options(digits = 7L)
+> base::cat("Time elapsed: ", proc.time() - base::get("ptime", pos = 'CheckExEnv'),"\n")
+Time elapsed: 0.155 0.019 0.224 0.009 0.006
+> grDevices::dev.off()
+null device
+ 1
+> ###
+> ### Local variables: ***
+> ### mode: outline-minor ***
+> ### outline-regexp: "\\(> \\)?### [*]+" ***
+> ### End: ***
+> quit('no')
diff --git a/KOR.addrlink/tests/test_l1score.R b/KOR.addrlink/tests/test_l1score.R
new file mode 100644
index 0000000..6708f80
--- /dev/null
+++ b/KOR.addrlink/tests/test_l1score.R
@@ -0,0 +1 @@
+KOR.addrlink:::l1score(c(-4, -1.3, 0, NA, 5.1, 10))
diff --git a/KOR.addrlink/tests/test_l1score.Rout.save b/KOR.addrlink/tests/test_l1score.Rout.save
new file mode 100644
index 0000000..6e952b2
--- /dev/null
+++ b/KOR.addrlink/tests/test_l1score.Rout.save
@@ -0,0 +1,23 @@
+
+R version 4.3.3 (2024-02-29) -- "Angel Food Cake"
+Copyright (C) 2024 The R Foundation for Statistical Computing
+Platform: x86_64-pc-linux-gnu (64-bit)
+
+R is free software and comes with ABSOLUTELY NO WARRANTY.
+You are welcome to redistribute it under certain conditions.
+Type 'license()' or 'licence()' for distribution details.
+
+R is a collaborative project with many contributors.
+Type 'contributors()' for more information and
+'citation()' on how to cite R or R packages in publications.
+
+Type 'demo()' for some demos, 'help()' for on-line help, or
+'help.start()' for an HTML browser interface to help.
+Type 'q()' to quit R.
+
+> KOR.addrlink:::l1score(c(-4, -1.3, 0, NA, 5.1, 10))
+[1] 0.60 0.87 1.00 NA 0.49 0.00
+>
+> proc.time()
+ user system elapsed
+ 0.242 0.062 0.266
diff --git a/KOR.addrlink/tests/test_sanitize_street.R b/KOR.addrlink/tests/test_sanitize_street.R
new file mode 100644
index 0000000..3543a6b
--- /dev/null
+++ b/KOR.addrlink/tests/test_sanitize_street.R
@@ -0,0 +1 @@
+KOR.addrlink:::sanitize_street(c("Binde-Strich-Strasse", "Teststr.", "A.-B.-C.-Str."))
diff --git a/KOR.addrlink/tests/test_sanitize_street.Rout.save b/KOR.addrlink/tests/test_sanitize_street.Rout.save
new file mode 100644
index 0000000..3bb9e3f
--- /dev/null
+++ b/KOR.addrlink/tests/test_sanitize_street.Rout.save
@@ -0,0 +1,23 @@
+
+R version 4.3.3 (2024-02-29) -- "Angel Food Cake"
+Copyright (C) 2024 The R Foundation for Statistical Computing
+Platform: x86_64-pc-linux-gnu (64-bit)
+
+R is free software and comes with ABSOLUTELY NO WARRANTY.
+You are welcome to redistribute it under certain conditions.
+Type 'license()' or 'licence()' for distribution details.
+
+R is a collaborative project with many contributors.
+Type 'contributors()' for more information and
+'citation()' on how to cite R or R packages in publications.
+
+Type 'demo()' for some demos, 'help()' for on-line help, or
+'help.start()' for an HTML browser interface to help.
+Type 'q()' to quit R.
+
+> KOR.addrlink:::sanitize_street(c("Binde-Strich-Strasse", "Teststr.", "A.-B.-C.-Str."))
+[1] "bindestrichstrasse" "teststrasse" "abcstrasse"
+>
+> proc.time()
+ user system elapsed
+ 0.259 0.059 0.289
diff --git a/KOR.addrlink/tests/test_split_address.R b/KOR.addrlink/tests/test_split_address.R
new file mode 100644
index 0000000..7b7db48
--- /dev/null
+++ b/KOR.addrlink/tests/test_split_address.R
@@ -0,0 +1,2 @@
+KOR.addrlink::split_address(c("Teststr. 8-9 a", "Erster Weg 1-2", "Ahornallee 100a-102c",
+"Stra\u00dfe des 1. Mai 10/12", "Emmerich-K\u00e1lm\u00e1n-Stra\u00dfe 15"))
diff --git a/KOR.addrlink/tests/test_split_address.Rout.save b/KOR.addrlink/tests/test_split_address.Rout.save
new file mode 100644
index 0000000..65ab67b
--- /dev/null
+++ b/KOR.addrlink/tests/test_split_address.Rout.save
@@ -0,0 +1,29 @@
+
+R version 4.3.3 (2024-02-29) -- "Angel Food Cake"
+Copyright (C) 2024 The R Foundation for Statistical Computing
+Platform: x86_64-pc-linux-gnu (64-bit)
+
+R is free software and comes with ABSOLUTELY NO WARRANTY.
+You are welcome to redistribute it under certain conditions.
+Type 'license()' or 'licence()' for distribution details.
+
+R is a collaborative project with many contributors.
+Type 'contributors()' for more information and
+'citation()' on how to cite R or R packages in publications.
+
+Type 'demo()' for some demos, 'help()' for on-line help, or
+'help.start()' for an HTML browser interface to help.
+Type 'q()' to quit R.
+
+> KOR.addrlink::split_address(c("Teststr. 8-9 a", "Erster Weg 1-2", "Ahornallee 100a-102c",
++ "Stra\u00dfe des 1. Mai 10/12", "Emmerich-K\u00e1lm\u00e1n-Stra\u00dfe 15"))
+ Strasse Hausnummer Hausnummernzusatz
+1 Teststr. 8 A
+2 Erster Weg 1 <NA>
+3 Ahornallee 102 C
+4 Straße des 1. Mai 10 <NA>
+5 Emmerich-Kálmán-Straße 15 <NA>
+>
+> proc.time()
+ user system elapsed
+ 0.277 0.053 0.299
diff --git a/KOR.addrlink/tests/test_split_number.R b/KOR.addrlink/tests/test_split_number.R
new file mode 100644
index 0000000..81d01eb
--- /dev/null
+++ b/KOR.addrlink/tests/test_split_number.R
@@ -0,0 +1 @@
+KOR.addrlink::split_number(c("8-9 a", "1-2", "100a-102c", "2/3", "25/27a", "3 5"))
diff --git a/KOR.addrlink/tests/test_split_number.Rout.save b/KOR.addrlink/tests/test_split_number.Rout.save
new file mode 100644
index 0000000..766e419
--- /dev/null
+++ b/KOR.addrlink/tests/test_split_number.Rout.save
@@ -0,0 +1,29 @@
+
+R version 4.3.3 (2024-02-29) -- "Angel Food Cake"
+Copyright (C) 2024 The R Foundation for Statistical Computing
+Platform: x86_64-pc-linux-gnu (64-bit)
+
+R is free software and comes with ABSOLUTELY NO WARRANTY.
+You are welcome to redistribute it under certain conditions.
+Type 'license()' or 'licence()' for distribution details.
+
+R is a collaborative project with many contributors.
+Type 'contributors()' for more information and
+'citation()' on how to cite R or R packages in publications.
+
+Type 'demo()' for some demos, 'help()' for on-line help, or
+'help.start()' for an HTML browser interface to help.
+Type 'q()' to quit R.
+
+> KOR.addrlink::split_number(c("8-9 a", "1-2", "100a-102c", "2/3", "25/27a", "3 5"))
+ Hausnummer Hausnummernzusatz
+1 8 <NA>
+2 1 <NA>
+3 100 a
+4 2 <NA>
+5 25 <NA>
+6 3 <NA>
+>
+> proc.time()
+ user system elapsed
+ 0.246 0.054 0.274
diff --git a/KOR.addrlink/vignettes/Example.Rnw b/KOR.addrlink/vignettes/Example.Rnw
new file mode 100644
index 0000000..aef9c2e
--- /dev/null
+++ b/KOR.addrlink/vignettes/Example.Rnw
@@ -0,0 +1,143 @@
+\documentclass{article}
+%\VignetteIndexEntry{Example}
+\usepackage[utf8]{inputenc}
+\begin{document}
+\SweaveOpts{concordance=TRUE}
+\title{Using KOR.addrlink}
+\author{Daniel Sch\"urmann}
+\date{February 29, 2024}
+\maketitle
+
+\section{Introduction}
+
+Consider a data set with semi-structured address data, e.g. street and house number as a concatenated string,
+wrongly spelled street names or non-existing house numbers. This data set (referred to as df\_match) should be
+mapped to a complete list of valid addresses within the given municipality. The latter data set is
+called df\_ref and may include further information like coordinates of addresses or district information.
+KOR.addrlink tries to solve this problem specifically for German municipalities as the package is specialized
+for German address schemes.
+
+\section{Reference data}
+
+First, a complete list of reference addresses (df\_ref) is needed. An example
+data.frame named "Adressen" is shown below.
+
+<<>>=
+library(KOR.addrlink)
+Adressen[c(sample(which(is.na(Adressen$HNRZ)), 4),
+ sample(which(!is.na(Adressen$HNRZ)), 2)),]
+@
+
+The columns used for the matching procedure are STRNAME (street name), HNR (house number)
+and HNRZ (additional letter). This vignette illustrates the merging workflow on two sample data sets called df1 and df2.
+
+\section{Example 1}
+df1 has address information in columns gross\_strasse and hausnr.
+The columns Var1 and Var2 provide non-address related information about
+the individuals. Row 1183 shows that the column hausnr needs to be split
+into house number and additional letter before addresses can be matched.
+The function split\_number is provided for that task.
+
+<<>>=
+df1[1180:(1183+6),]
+@
+
+split\_number takes hausnr and creates a data.frame with columns "Hausnummer"
+(house number) and "Hausnummernzusatz" (additional letter).
+
+<<>>=
+df1 <- cbind(df1, split_number(df1$hausnr))
+df1[1180:(1183+6),]
+@
+
+addrlink merges the two data sets. For both data sets, the columns referring
+to street name, house number and additional letter need to be specified
+in exactly that order (parameters col\_ref and col\_match).
+
+<<>>=
+# column hausnr is no longer needed
+df1 <- within(df1, rm(hausnr))
+df1_matched <- addrlink(df_ref = Adressen,
+ col_ref = c("STRNAME", "HNR", "HNRZ"),
+ df_match = df1,
+ col_match = c("gross_strasse", "Hausnummer", "Hausnummernzusatz"))
+@
+
+The result is a list with two data.frames
+\begin{itemize}
+\item ret: The merged data set
+\item QA: Indicators showing the match quality
+\end{itemize}
+
+<<>>=
+head(df1_matched$ret)
+table(df1_matched$QA$qAddress)
+@
+
+qAddress states the stage within the matching procedure that yielded the match.
+Out of the 10000 records, 9670 could be merged directly. 72 had a valid street
+name, but an invalid house number. 157 records had (possibly) misspelled street
+names and 101 records could not be matched at all.
+
+\section{Example 2}
+
+The second data set has a single column "Adresse", which includes street names
+and house numbers. Thus, this column needs to be split by the function
+split\_address.
+
+<<>>=
+head(within(df2, Adresse <- trimws(Adresse)))
+@
+
+split\_address creates a data.frame with columns "Strasse" (street), "Hausnummer"
+(house number) and "Hausnummernzusatz" (additional letter) from the column
+"Adresse".
+
+<<>>=
+df2 <- cbind(df2, split_address(df2$Adresse))
+within(df2, Adresse <- trimws(Adresse))[23:(23+6),]
+@
+
+Again, addrlink merges the two data sets. The parameter fuzzy\_threshold
+sets the threshold for fuzzy matching of misspelled street names. A value
+of 1 means no fuzzy matching and 0 means forced fuzzy matches for all records.
+If a street name could be matched, but the provided house number does not exist, addrlink
+may randomly assign a valid house number to that record. A seed is always set
+to ensure reproducibility. Customization is possible via the parameter seed.
+
+<<>>=
+# column Adresse is no longer needed
+df2 <- within(df2, rm(Adresse))
+df2_matched <- addrlink(df_ref = Adressen,
+ col_ref = c("STRNAME", "HNR", "HNRZ"),
+ df_match = df2,
+ col_match = c("Strasse", "Hausnummer", "Hausnummernzusatz"),
+ fuzzy_threshold = .9, seed = 1234)
+@
+
+<<>>=
+head(df2_matched$ret)
+table(df2_matched$QA$qAddress)
+@
+
+49 records had invalid house numbers and one record was matched by
+fuzzy matching. This record can be inspected in detail.
+
+<<>>=
+id <- which(df2_matched$QA$qAddress == 3)
+df2_matched$ret[id,]
+df2_matched$QA[id,]
+@
+
+In this case the fuzzy matching procedure was most likely correct
+(St.-Georg-Str. matched SANKT-GEORG-STRA{\ss}E).
+
+The number of cases with correct street name and randomly assigned house
+numbers is 10.
+
+<<>>=
+sum(df2_matched$QA$qscore == 0)
+@
+
+
+\end{document}
diff --git a/README.html b/README.html
new file mode 100644
index 0000000..1494e7f
--- /dev/null
+++ b/README.html
@@ -0,0 +1,18 @@
+<h1 id="kor.addrlink">KOR.addrlink</h1>
+<h3 id="beschreibung">Beschreibung</h3>
+<p>Die Ausgangssituation ist eine Datenlieferung mit Adressdaten als
+Textfeld. Diese sollen anhand einer vorhandenen Adressdatenbank
+geokodiert werden (z.B. den UBZ zugeordnet werden)</p>
+<h2 id="installation">Installation</h2>
+<p><code>install.packages("KOR.addrlink")</code></p>
+<h2 id="benutzung">Benutzung</h2>
+<p>Am besten in die Vignette gucken. Da sind zwei verschiedene Beispiele
+drin.</p>
+<h2 id="hilfe">Hilfe</h2>
+<p>d.schuermann@2718282.net</p>
+<h2 id="beitragen">Beitragen</h2>
+<p>Issues und pull requests sind gerne gesehen!</p>
+<h2 id="autor">Autor</h2>
+<p>Daniel Schürmann</p>
+<h2 id="lizenz">Lizenz</h2>
+<p>GPL-3</p> \ No newline at end of file