First cran versionHEAD v1.0.1 master

author: Daniel Schürmann <d.schuermann@2718282.net> 2024-03-10 12:22:13 +0100
committer: Daniel Schürmann <d.schuermann@2718282.net> 2024-03-10 12:22:13 +0100
commit: e02f41b9b1dc3c45f6626e8b01fee2acb5b905d4 (patch)
tree: 8fa7087b5f5fcfe5513c97ef7fa6f0fdb9edf614
download: KOR.addrlink-390f76d388fa183fdf5448219525b79cefe909f6.tar.gz
KOR.addrlink-390f76d388fa183fdf5448219525b79cefe909f6.zip
36 files changed, 940 insertions, 0 deletions
diff --git a/KOR.addrlink/DESCRIPTION b/KOR.addrlink/DESCRIPTION
new file mode 100644
index 0000000..8761e79
--- /dev/null
+++ b/KOR.addrlink/DESCRIPTION
@@ -0,0 +1,21 @@
+Package: KOR.addrlink
+Type: Package
+Title: Matching Address Data to Reference Index
+Version: 1.0.1
+Date: 2024-03-02
+Author: Daniel Schürmann [aut, cre]
+Authors@R: person("Daniel", "Schürmann", role = c("aut", "cre"), email = "d.schuermann@2718282.net")
+Maintainer: Daniel Schürmann <d.schuermann@2718282.net>
+Depends: R (>= 3.4)
+Imports: stringdist, stringi
+LazyData: true
+Description: Matches a data set with semi-structured address data, 
+ e.g., street and house number as a concatenated string,
+ wrongly spelled street names or non-existing house numbers to a 
+ reference index. The methods are specifically designed for German 
+ municipalities ('KOR'-community) and German address schemes.
+License: GPL-3
+Encoding: UTF-8
+URL: https://git-kor.stadtdo.de
+BugReports: https://git-kor.stadtdo.de/stadt-dortmund/adressdaten/-/issues
+
diff --git a/KOR.addrlink/NAMESPACE b/KOR.addrlink/NAMESPACE
new file mode 100644
index 0000000..6a6a065
--- /dev/null
+++ b/KOR.addrlink/NAMESPACE
@@ -0,0 +1,4 @@
+export("addrlink", "split_address", "split_number")
+importFrom("utils", "head")
+importFrom("stringi", "stri_replace_all_regex", "stri_trans_general", "stri_trans_nfc")
+importFrom("stringdist", "stringsimmatrix")
diff --git a/KOR.addrlink/R/addrlink.R b/KOR.addrlink/R/addrlink.R
new file mode 100644
index 0000000..49deec8
--- /dev/null
+++ b/KOR.addrlink/R/addrlink.R
@@ -0,0 +1,111 @@
+# Match the addresses in df_match against the reference index df_ref.
+#
+# Arguments:
+#   df_ref, df_match    data.frames with the reference addresses / the
+#                       addresses to be matched
+#   col_ref, col_match  character(3): names of the street, house number and
+#                       additional letter columns (in that order)
+#   fuzzy_threshold     similarity in (0, 1) a fuzzy street match must exceed
+#   seed                positive seed handed to set.seed(); match_number()
+#                       draws random reference rows when it has to guess
+#
+# Returns a list with
+#   ret  df_match rows bound to the matched df_ref rows (NA rows if unmatched)
+#   QA   data.frame with qAddress (1 exact, 2 street exact, 3 street fuzzy,
+#        4 no match) and qscore
+addrlink <-
+function(df_ref, df_match, 
+		col_ref = c("Strasse", "Hausnummer", "Hausnummernzusatz"), 
+		col_match = c("Strasse", "Hausnummer", "Hausnummernzusatz"), 
+		fuzzy_threshold = .9, seed = 1234){
+	
+	key <- c("Strasse", "Hausnummer", "Hausnummernzusatz")
+	
+	stopifnot(is.data.frame(df_ref), is.data.frame(df_match), 
+		is.vector(col_ref), is.character(col_ref), 
+		is.vector(col_match), is.character(col_match))
+	stopifnot(anyDuplicated(colnames(df_ref)) == 0)
+	stopifnot(anyDuplicated(colnames(df_match)) == 0)
+	stopifnot(length(col_ref) == 3, length(col_match) == 3)
+	# every column name must be given (the original any() let NAs slip through)
+	stopifnot(all(!is.na(col_ref)), all(!is.na(col_match)))
+	stopifnot(col_ref %in% colnames(df_ref))
+	stopifnot(col_match %in% colnames(df_match))
+	stopifnot(fuzzy_threshold < 1, fuzzy_threshold > 0)
+	stopifnot(seed > 0)
+	set.seed(seed)
+	
+	# zero-row result with the columns every pass produces
+	empty_result <- function() data.frame(Strasse = character(), Hausnummer = numeric(), 
+		Hausnummernzusatz = character(), id.addr = numeric(), id.df = numeric(), 
+		qAddress = numeric(), qscore = numeric())
+	
+	# run match_number() on every row of `records` (row order is preserved) and
+	# attach the originating id.df; rows are passed as one-row data.frames so
+	# that no column is coerced to character
+	match_rows <- function(records, Adressen){
+		hits <- lapply(seq_len(nrow(records)), function(i) 
+			match_number(record = records[i, , drop = FALSE], Adressen = Adressen))
+		cbind(id.df = records$id.df, do.call(rbind, hits))
+	}
+	
+	# reference index
+	Adressen <- as.data.frame(df_ref[, col_ref])
+	stopifnot(is.character(Adressen[[1]]), is.numeric(Adressen[[2]]), is.character(Adressen[[3]]))
+	stopifnot(anyDuplicated(Adressen) == 0)
+	stopifnot(nrow(Adressen) > 0)
+	colnames(Adressen) <- key
+	Adressen$id.addr <- seq_len(nrow(Adressen))
+	Adressen$Strasse <- sanitize_street(Adressen$Strasse)
+	Adressen$Hausnummernzusatz <- tolower(Adressen$Hausnummernzusatz)
+	
+	# data to be matched
+	df <- as.data.frame(df_match[, col_match])
+	stopifnot(is.character(df[[1]]), is.numeric(df[[2]]), 
+		is.character(df[[3]]))
+	stopifnot(nrow(df) > 0)
+	colnames(df) <- key
+	df$id.df <- seq_len(nrow(df))
+	df$Strasse <- sanitize_street(df$Strasse)
+	df$Hausnummernzusatz <- tolower(df$Hausnummernzusatz)
+	
+	# first pass (direct matches)
+	fp <- merge(x = df, y = Adressen, by = key, incomparables = NULL)
+	if(nrow(fp) > 0){
+		fp$qAddress <- 1
+		fp$qscore <- 1
+	} else fp <- empty_result()
+	
+	# second pass (street correct)
+	ref_streets <- unique(Adressen[, "Strasse", drop = FALSE])
+	tmp <- df[!(df$id.df %in% fp$id.df) & !is.na(df$Strasse),]
+	sp <- merge(x = tmp, y = ref_streets, by = "Strasse", incomparables = NULL)
+	if(nrow(sp) > 0){
+		sp <- match_rows(sp, Adressen)
+		sp$qAddress <- 2
+		sp <- merge(x = sp, y = Adressen, by = key, incomparables = NULL)
+	} else sp <- empty_result()
+	
+	# third pass (fuzzy matches)
+	tp <- df[!(df$id.df %in% c(fp$id.df, sp$id.df)) & !is.na(df$Strasse),]
+	uSTR <- unique(Adressen$Strasse)
+	uSTR <- uSTR[!is.na(uSTR)] # an NA street would turn every row maximum into NA
+	hit <- integer(0)
+	if(nrow(tp) > 0 && length(uSTR) > 0){
+		cores <- parallel::detectCores() # may be NA on some platforms
+		nthread <- if(is.na(cores)) 1 else max(1, floor(cores / 2))
+		sim <- stringdist::stringsimmatrix(a = tp$Strasse, b = uSTR, method = "jw", 
+			nthread = nthread)
+		best_col <- vapply(seq_len(nrow(sim)), function(i){
+			j <- which.max(sim[i, ])
+			if(length(j) == 0) NA_integer_ else as.integer(j)
+		}, integer(1))
+		best_sim <- sim[cbind(seq_len(nrow(sim)), best_col)]
+		hit <- which(best_sim > fuzzy_threshold)
+	}
+	if(length(hit) > 0){
+		# Subset instead of merging on the street name: merge() re-sorts the rows,
+		# which used to pair the fuzzy scores below with the wrong records.
+		tp <- tp[hit, , drop = FALSE]
+		tp$Strasse <- uSTR[best_col[hit]]
+		fuzzy <- best_sim[hit]
+		tp <- match_rows(tp, Adressen)
+		tp$qscore <- tp$qscore * fuzzy
+		tp$qAddress <- 3
+		tp <- merge(x = tp, y = Adressen, by = key, incomparables = NULL)
+	} else tp <- empty_result()
+	
+	# no match
+	nomatch <- df[!(df$id.df %in% c(fp$id.df, sp$id.df, tp$id.df)),]
+	if(nrow(nomatch) > 0){
+		nomatch$Strasse <- NA
+		nomatch$Hausnummer <- NA
+		nomatch$Hausnummernzusatz <- NA
+		nomatch$id.addr <- NA
+		nomatch$qAddress <- 4
+		nomatch$qscore <- 0
+	} else nomatch <- empty_result()
+
+	# results (rbind matches the data.frame columns by name)
+	res <- rbind(fp, sp, tp, nomatch)
+	ret <- cbind(df_match[res$id.df,], df_ref[res$id.addr,])
+	return(list(ret = ret, QA = res[, c("qAddress", "qscore")]))
+}
diff --git a/KOR.addrlink/R/helper_split_address.R b/KOR.addrlink/R/helper_split_address.R
new file mode 100644
index 0000000..e750ae8
--- /dev/null
+++ b/KOR.addrlink/R/helper_split_address.R
@@ -0,0 +1,33 @@
+# Split one address string into street, house number and additional letter.
+#
+# Arguments:
+#   x      a single address string
+#   debug  if TRUE, x is printed before it is processed
+#
+# Returns list(strasse, hnr, hnrz). If x contains no digit, the trimmed input
+# is returned as strasse and hnr/hnrz are NA.
+helper_split_address <-
+function(x, debug = FALSE){
+	if(debug) print(x)
+	x <- trimws(x)
+	x_split <- unlist(strsplit(x, ''))
+	# positions of all digits
+	num.idx <- which(x_split %in% as.character(0:9))
+	if(length(num.idx) == 0){ return(list(strasse = x, hnr = NA, hnrz = NA)) }
+	max.num.idx <- max(num.idx)
+	# blanks, "-" and "/" strictly between the first and the last digit are
+	# treated as part of a number expression
+	num.extra <- which(x_split %in% c(" ", "-", "/"))
+	num.extra <- num.extra[min(num.idx) < num.extra & max.num.idx > num.extra]
+	num.idx <- c(num.idx, num.extra)
+	# candidate index ranges start:max.num.idx; because of min(1, ...) the
+	# start runs from 1 (or below) up to max.num.idx
+	# NOTE(review): max(1, ...) may have been intended to limit the window to
+	# eight characters - confirm before changing
+	idx.subs <- lapply(min(1, (max.num.idx - 7)):max.num.idx, function(x) x:max.num.idx)
+	# first (= longest) candidate made up of digits and separators only
+	idx.hnr <- idx.subs[[min(which(unlist(lapply(idx.subs, function(y) all(y %in% num.idx)))))]]
+	hnr <- trimws(substr(x, min(idx.hnr), max(idx.hnr)))
+	# for expressions such as "8-9", "8 9" or "10/12" keep the first piece that
+	# converts to a number
+	if(grepl("-", hnr)){ 
+		hnr <- as.numeric(unlist(strsplit(hnr, "-")))
+		hnr <- head(hnr[!is.na(hnr)], 1)
+	}
+	if(grepl(" ", hnr)){ 
+		hnr <- as.numeric(unlist(strsplit(hnr, " ")))
+		hnr <- head(hnr[!is.na(hnr)], 1)
+	}
+	# the else branch belongs to this last if only; it also converts a plain
+	# number string (no separator matched above) to numeric
+	if(grepl("/", hnr)){ 
+		hnr <- as.numeric(unlist(strsplit(hnr, "/")))
+		hnr <- head(hnr[!is.na(hnr)], 1)
+	} else {
+	hnr <- as.numeric(hnr)}
+	# additional letter: first character after the number expression, upper-cased
+	hnrz <- toupper(substr(trimws(substr(x, max(idx.hnr) + 1, nchar(x))), 1, 1))
+	if(nchar(hnrz) == 0) hnrz <- NA
+	strasse <- trimws(substr(x, 1, min(idx.hnr) - 1))
+	# drop a trailing "<digits><letter>" left over from ranges like "100a-102c"
+	# NOTE(review): a blank in front of the removed part is kept, so strasse can
+	# end in a space (see "Ahornallee " in the recorded example output)
+	strasse <- sub("[[:digit:]]+[a-zA-Z]$", "", strasse)
+	return(list(strasse = strasse, hnr = hnr, hnrz = hnrz))
+}
diff --git a/KOR.addrlink/R/helper_split_number.R b/KOR.addrlink/R/helper_split_number.R
new file mode 100644
index 0000000..c19e89d
--- /dev/null
+++ b/KOR.addrlink/R/helper_split_number.R
@@ -0,0 +1,25 @@
+# Split one house number string into house number and additional letter.
+#
+# Arguments:
+#   x      a single string such as "8-9 a" or "100a-102c"
+#   debug  if TRUE, x is printed before it is processed
+#
+# Returns a one-row data.frame with the columns Hausnummer (leading number of
+# the first number expression) and Zusatz (the character directly following
+# that number, NA if there is none). Both are NA when x is NA, empty or
+# contains no digit.
+helper_split_number <-
+function(x, debug = FALSE){
+	if(debug) print(x)
+	nothing <- data.frame(Hausnummer = NA, Zusatz = NA)
+	# nchar(NA_character_) is NA, so missing input has to be caught up front
+	if(is.na(x)){ return(nothing) }
+	# "-" and "/" separate number expressions just like white space does
+	x <- stringi::stri_replace_all_regex(str = x, 
+		pattern = c("-", "/", "\\s+"), replace = c(" ", " ", " "), 
+		vectorize_all = FALSE)
+	x <- trimws(x)
+	if(nchar(x) == 0){ return(nothing) }
+	x_split <- unlist(strsplit(x, ''))
+	x_start <- head(which(x_split %in% as.character(0:9)), 1)
+	# without any digit there is nothing to extract (substr() would fail on a
+	# zero-length start)
+	if(length(x_start) == 0){ return(nothing) }
+	x <- substr(x, x_start, nchar(x))
+	x_split <- unlist(strsplit(x, ''))
+	# only the first blank-separated token is evaluated
+	if(" " %in% x_split){
+		x <- strsplit(x, ' ')[[1]][1]
+		x_split <- unlist(strsplit(x, ''))
+	}
+	is_digit <- !is.na(suppressWarnings(as.numeric(x_split)))
+	# x starts with a digit, so the first run is the house number
+	n_digits <- head(rle(is_digit)$lengths, 1)
+	hausnr <- as.numeric(substr(x, 1, n_digits))
+	zusatz <- substr(x, n_digits + 1, n_digits + 1)
+	if(zusatz == ""){ zusatz <- NA }
+	return(data.frame(Hausnummer = hausnr, Zusatz = zusatz))
+}
diff --git a/KOR.addrlink/R/l1score.R b/KOR.addrlink/R/l1score.R
new file mode 100644
index 0000000..86154ed
--- /dev/null
+++ b/KOR.addrlink/R/l1score.R
@@ -0,0 +1,8 @@
+# Turn distances into scores: 1 - |x| / max(1, |x|), where the maximum runs
+# over 1 and all non-missing |x|. A distance of 0 scores 1, the largest
+# distance (if >= 1) scores 0; NA entries stay NA. If every entry is NA
+# (or x is empty) all scores are 1.
+l1score <-
+function(x){
+	stopifnot(is.vector(x) & is.numeric(x))
+	if(all(is.na(x))){
+		return(rep(1, length(x)))
+	}
+	magnitude <- abs(x)
+	largest <- max(c(1, magnitude), na.rm = TRUE)
+	1 - magnitude / largest
+}
diff --git a/KOR.addrlink/R/match_number.R b/KOR.addrlink/R/match_number.R
new file mode 100644
index 0000000..34e2ade
--- /dev/null
+++ b/KOR.addrlink/R/match_number.R
@@ -0,0 +1,52 @@
+# Find the best house number / additional letter match within one street.
+#
+# Arguments:
+#   record    one record with the elements Strasse, Hausnummer and
+#             Hausnummernzusatz (one-row data.frame or named vector)
+#   Adressen  data.frame of all valid addresses (same three columns)
+#   weights   weights of the house number score and the letter score
+#
+# Returns a one-row data.frame: qscore, Strasse, Hausnummer, Hausnummernzusatz.
+# Ties and hopeless cases are resolved with sample(), so the result depends on
+# the state of the random number generator.
+match_number <- 
+function(record, Adressen, weights = c(.9, .1)){
+	street <- record[["Strasse"]]
+	# which() drops reference rows with a missing street instead of turning
+	# them into all-NA candidates
+	valid <- Adressen[which(Adressen$Strasse == street), c("Hausnummer", "Hausnummernzusatz")]
+	stopifnot(nrow(valid) > 0)
+	
+	# build the return value from the candidate rows `ids`; sample() is only
+	# used for real ties, because sample(n, 1) would draw from 1:n
+	pick <- function(ids, qscore){
+		row <- if(length(ids) == 1) ids else sample(ids, 1)
+		cbind(qscore = qscore, Strasse = street, valid[row,])
+	}
+	# no usable information: any address of the street, qscore 0
+	pick_random <- function(){
+		cbind(qscore = 0, Strasse = street, valid[sample(seq_len(nrow(valid)), 1),])
+	}
+	# weighted letter score; letters are compared case-insensitively because
+	# addrlink() lower-cases them while LETTERS is upper case
+	letter_score <- function(){
+		l1score(match(toupper(record[["Hausnummernzusatz"]]), LETTERS) - 
+			match(toupper(valid$Hausnummernzusatz), LETTERS)) * weights[2]
+	}
+	# match on the additional letter only (qscore 0.05)
+	by_letter <- function(){
+		zusatz <- letter_score()
+		ids <- which(zusatz == max(zusatz, na.rm = TRUE))
+		if(length(ids) == 0){ return(pick_random()) }
+		pick(ids, 0.05)
+	}
+	
+	has_hnr <- !is.na(record[["Hausnummer"]])
+	has_hnrz <- !is.na(record[["Hausnummernzusatz"]])
+	
+	if(!has_hnr && !has_hnrz){ return(pick_random()) } # no info
+	if(!has_hnr){ return(by_letter()) } # no hnr, but hnrz
+	
+	hausnr_diff <- as.numeric(record[["Hausnummer"]]) - as.numeric(valid$Hausnummer)
+	hausnr <- l1score(hausnr_diff) * weights[1]
+	# a house number more than 4 away from every valid one is not trusted
+	too_far <- min(abs(hausnr_diff), na.rm = TRUE) > 4
+	
+	if(!has_hnrz){ # hnr, no hnrz
+		if(too_far){ return(pick_random()) }
+		val <- max(hausnr, na.rm = TRUE)
+		return(pick(which(hausnr == val), val))
+	}
+	
+	# hnr and hnrz
+	if(too_far){ return(by_letter()) }
+	zusatz <- letter_score()
+	# small penalty for candidates whose letter cannot be compared
+	zusatz[is.na(zusatz)] <- - 0.05
+	score <- hausnr + zusatz
+	val <- max(score, na.rm = TRUE)
+	pick(which(score == val), val)
+}
+\ No newline at end of file
diff --git a/KOR.addrlink/R/sanitize_street.R b/KOR.addrlink/R/sanitize_street.R
new file mode 100644
index 0000000..d7ff05f
--- /dev/null
+++ b/KOR.addrlink/R/sanitize_street.R
@@ -0,0 +1,15 @@
+# Normalise street names so that differently spelled variants become equal:
+# lower case, umlauts and sharp s spelled out, hyphens, blanks and dots
+# removed, a trailing "str" expanded to "strasse", remaining characters
+# transliterated to ASCII and all punctuation dropped.
+# x must be a character vector; the result has the same length.
+sanitize_street <-
+function(x){
+	stopifnot(is.character(x), is.vector(x))
+	# rules are applied one after another, in this order
+	rules <- rbind(
+		c("\u00e4", "ae"), 
+		c("\u00fc", "ue"), 
+		c("\u00f6", "oe"), 
+		c("\u00df", "ss"), 
+		c("-", ""), 
+		c(" ", ""), 
+		c("\\.", ""), 
+		c("str$", "strasse"))
+	street <- stringi::stri_replace_all_regex(str = tolower(x), 
+		pattern = rules[, 1], replace = rules[, 2], 
+		vectorize_all = FALSE)
+	street <- stringi::stri_trans_nfc(str = 
+		stringi::stri_trans_general(str = street, id = "Any-Latin;Latin-ASCII"))
+	stringi::stri_replace_all_regex(str = street, 
+		pattern = "[[:punct:]]", replace = "", vectorize_all = FALSE)
+}
diff --git a/KOR.addrlink/R/split_address.R b/KOR.addrlink/R/split_address.R
new file mode 100644
index 0000000..4fc71d5
--- /dev/null
+++ b/KOR.addrlink/R/split_address.R
@@ -0,0 +1,10 @@
+# Split concatenated addresses ("street house-number letter") into their parts.
+#
+# Arguments:
+#   x      character vector of addresses
+#   debug  if TRUE, every element is printed before it is processed
+#
+# Returns a data.frame with one row per element of x and the columns Strasse
+# (character), Hausnummer (numeric) and Hausnummernzusatz (character). The
+# column types are fixed even if x is empty or a column consists of NA only,
+# so the result can be passed on to addrlink().
+split_address <-
+function(x, debug = FALSE) { 
+	stopifnot(is.character(x), is.vector(x))
+	res <- lapply(unname(x), helper_split_address, debug = debug)
+	data.frame(
+		Strasse = vapply(res, function(r) as.character(r[[1]]), character(1)), 
+		Hausnummer = vapply(res, function(r) as.numeric(r[[2]]), numeric(1)), 
+		Hausnummernzusatz = vapply(res, function(r) as.character(r[[3]]), character(1)), 
+		stringsAsFactors = FALSE)
+}
diff --git a/KOR.addrlink/R/split_number.R b/KOR.addrlink/R/split_number.R
new file mode 100644
index 0000000..4f7e584
--- /dev/null
+++ b/KOR.addrlink/R/split_number.R
@@ -0,0 +1,16 @@
+# Split house number strings ("8-9 a", "100a") into number and letter.
+#
+# Arguments:
+#   x      character vector of house number strings
+#   debug  if TRUE, every element that needs splitting is printed
+#
+# Returns a data.frame with one row per element of x and the columns
+# Hausnummer (numeric) and Hausnummernzusatz (character). "0", "NULL" and
+# empty strings are treated as missing.
+split_number <-
+function(x, debug = FALSE){ 
+	stopifnot(is.character(x), is.vector(x))
+	x <- trimws(x)
+	x[x %in% c("0", "NULL", "")] <- NA
+	# plain numbers are converted directly; the letter column is character
+	# from the start so its type does not depend on the data
+	x_ready <- data.frame(Hausnummer = suppressWarnings(as.numeric(x)), 
+		Hausnummernzusatz = rep(NA_character_, length(x)), 
+		stringsAsFactors = FALSE)
+	# everything that is present but not a plain number has to be parsed
+	ids <- which(!is.na(x) & is.na(x_ready$Hausnummer))
+	if(length(ids) == 0){ return(x_ready) }
+	res <- lapply(x[ids], helper_split_number, debug = debug)
+	x_ready$Hausnummer[ids] <- vapply(res, 
+		function(r) as.numeric(r[[1]]), numeric(1))
+	x_ready$Hausnummernzusatz[ids] <- vapply(res, 
+		function(r) as.character(r[[2]]), character(1))
+	return(x_ready)
+}
diff --git a/KOR.addrlink/data/Adressen.RData b/KOR.addrlink/data/Adressen.RData
new file mode 100644
index 0000000..af08d1e
--- /dev/null
+++ b/KOR.addrlink/data/Adressen.RData
diff --git a/KOR.addrlink/data/df1.RData b/KOR.addrlink/data/df1.RData
new file mode 100644
index 0000000..032d1e1
--- /dev/null
+++ b/KOR.addrlink/data/df1.RData
diff --git a/KOR.addrlink/data/df2.RData b/KOR.addrlink/data/df2.RData
new file mode 100644
index 0000000..41f7f4d
--- /dev/null
+++ b/KOR.addrlink/data/df2.RData
diff --git a/KOR.addrlink/man/Adressen.Rd b/KOR.addrlink/man/Adressen.Rd
new file mode 100644
index 0000000..b04cc94
--- /dev/null
+++ b/KOR.addrlink/man/Adressen.Rd
@@ -0,0 +1,21 @@
+\name{Adressen}
+\docType{data}
+\alias{Adressen}
+\title{Address data from the city of Dortmund}
+\description{
+  This data set gives all the addresses in the city of Dortmund. 
+}
+\usage{Adressen}
+\format{A data.frame
+  \tabular{lll}{
+    STRNAME   \tab character \tab street name\cr
+    STRSL \tab numeric \tab street number\cr
+    HNR    \tab numeric \tab house number\cr
+    HNRZ    \tab character \tab additional letter\cr
+    RW   \tab numeric \tab longitude \cr
+    HW     \tab numeric \tab latitude \cr
+    UBZ     \tab numeric \tab subdistrict number
+  }
+}
+\source{\url{https://open-data.dortmund.de}}
+\keyword{datasets}
+\ No newline at end of file
diff --git a/KOR.addrlink/man/KOR.addrlink-package.Rd b/KOR.addrlink/man/KOR.addrlink-package.Rd
new file mode 100644
index 0000000..9885e03
--- /dev/null
+++ b/KOR.addrlink/man/KOR.addrlink-package.Rd
@@ -0,0 +1,15 @@
+\name{KOR.addrlink-package}
+\alias{KOR.addrlink-package}
+\alias{KOR.addrlink}
+\docType{package}
+\title{KOR.addrlink}
+\description{Geocode address data from German municipalities}
+\details{
+\itemize{
+  \item \code{\link{split_address}} Splits strings into street, house number and additional letter
+  \item \code{\link{split_number}} Splits strings into house number and additional letter
+  \item \code{\link{addrlink}} Matches split address data to reference table
+}
+Matching is based on street name, house number and additional letter. 
+}
+\author{Daniel Schürmann}
diff --git a/KOR.addrlink/man/addrlink.Rd b/KOR.addrlink/man/addrlink.Rd
new file mode 100644
index 0000000..099c1cc
--- /dev/null
+++ b/KOR.addrlink/man/addrlink.Rd
@@ -0,0 +1,47 @@
+\name{addrlink}
+\alias{addrlink}
+\title{Merge Data To Reference Index}
+\description{
+Takes two data.frames with address data and merges them together. 
+}
+\usage{
+addrlink(df_ref, df_match, 
+col_ref = c("Strasse", "Hausnummer", "Hausnummernzusatz"), 
+col_match = c("Strasse", "Hausnummer", "Hausnummernzusatz"), 
+fuzzy_threshold = 0.9, seed = 1234)
+}
+%- maybe also 'usage' for other objects documented here.
+\arguments{
+  \item{df_ref}{data.frame with address references}
+  \item{df_match}{data.frame with addresses to be matched}
+  \item{col_ref}{character vector of length three, naming the df_ref columns which contain the street names, house numbers and additional letters (in that order)}
+  \item{col_match}{character vector of length three, naming the df_match columns which contain the street names, house numbers and additional letters (in that order)}
+  \item{fuzzy_threshold}{The threshold used for fuzzy matching street names}
+  \item{seed}{Seed for random numbers}
+}
+\details{
+The matching is done in four stages. 
+
+\bold{Stage 1} (qAddress = 1). This is an exact match (highest quality, qscore = 1)
+
+\bold{Stage 2} (qAddress = 2). Exact match on street name, but no valid house 
+number could be found. Be aware that random house numbers might be used. 
+Consider setting your own seed. qscore indicates the match quality. 
+See \code{\link{match_number}} for details.
+
+\bold{Stage 3} (qAddress = 3). No exact match on street name could be found. 
+Street names are fuzzy matched. The method "jw" (Jaro-Winkler distance) from 
+package stringdist is used (see stringdist-metrics). If 1 - [Jaro-Winkler distance] 
+is greater than fuzzy_threshold, a match is assumed. The highest score is 
+taken and house number matching is done as outlined in Stage 2. 
+qscore is fuzzy_score*[house number score]. 
+
+\bold{Stage 4} (qAddress = 4). No match (qscore = 0)
+}
+\value{
+A list
+  \item{ret}{The merged dataset}
+  \item{QA}{The quality markers (qAddress and qscore)}
+}
+\author{Daniel Schürmann}
+\seealso{\code{\link{split_address}}, \code{\link{split_number}}}
+\ No newline at end of file
diff --git a/KOR.addrlink/man/df1.Rd b/KOR.addrlink/man/df1.Rd
new file mode 100644
index 0000000..cc56c68
--- /dev/null
+++ b/KOR.addrlink/man/df1.Rd
@@ -0,0 +1,18 @@
+\name{df1}
+\docType{data}
+\alias{df1}
+\title{Example dataset 1}
+\description{
+  This dataset contains separate street and house number information.
+}
+\usage{df1}
+\format{A data.frame
+  \tabular{lll}{
+    gross_strasse   \tab character \tab street names\cr
+    hausnr \tab character \tab house number and additional letter\cr
+    Var1    \tab numeric \tab Variable 1\cr
+    Var2    \tab character \tab Variable 2
+  }
+}
+\source{Dortmunder Statistik}
+\keyword{datasets}
+\ No newline at end of file
diff --git a/KOR.addrlink/man/df2.Rd b/KOR.addrlink/man/df2.Rd
new file mode 100644
index 0000000..0937b62
--- /dev/null
+++ b/KOR.addrlink/man/df2.Rd
@@ -0,0 +1,17 @@
+\name{df2}
+\docType{data}
+\alias{df2}
+\title{Example dataset 2}
+\description{
+  This dataset contains concatenated street and house number information.
+}
+\usage{df2}
+\format{A data.frame
+  \tabular{lll}{
+    Adresse   \tab character \tab street name, house number and additional letter\cr
+    Var1    \tab numeric \tab Variable 1\cr
+    Var2    \tab character \tab Variable 2
+  }
+}
+\source{Dortmunder Statistik}
+\keyword{datasets}
+\ No newline at end of file
diff --git a/KOR.addrlink/man/helper_split_address.Rd b/KOR.addrlink/man/helper_split_address.Rd
new file mode 100644
index 0000000..bc87965
--- /dev/null
+++ b/KOR.addrlink/man/helper_split_address.Rd
@@ -0,0 +1,18 @@
+\name{helper_split_address}
+\alias{helper_split_address}
+\title{Splits A Single Address Into Street, House Number And Additional Letter}
+\description{This is an internal function. Please use \code{\link{split_address}}}
+\usage{helper_split_address(x, debug = FALSE)
+}
+\arguments{
+  \item{x}{A character vector of length 1}
+  \item{debug}{If true, print(x)}
+}
+\value{
+A list with three elements
+  \item{strasse}{Extracted street name}
+  \item{hnr}{Extracted house number}
+  \item{hnrz}{Extracted extra letter}   
+}
+\author{Daniel Schürmann}
+\seealso{\code{\link{split_address}}}
diff --git a/KOR.addrlink/man/helper_split_number.Rd b/KOR.addrlink/man/helper_split_number.Rd
new file mode 100644
index 0000000..570504c
--- /dev/null
+++ b/KOR.addrlink/man/helper_split_number.Rd
@@ -0,0 +1,18 @@
+\name{helper_split_number}
+\alias{helper_split_number}
+\title{Splits A Single House Number Into House Number And Additional Letter}
+\description{This is an internal function. Please use \code{\link{split_number}}}
+\usage{helper_split_number(x, debug = FALSE)
+}
+\arguments{
+  \item{x}{A character vector of length 1}
+  \item{debug}{If true, print(x)}
+}
+\value{
+A data.frame with two elements
+  \item{Hausnummer}{Extracted house number}
+  \item{Zusatz}{Extracted extra letter}
+}
+\author{Daniel Schürmann}
+\seealso{\code{\link{split_number}}}
+
diff --git a/KOR.addrlink/man/l1score.Rd b/KOR.addrlink/man/l1score.Rd
new file mode 100644
index 0000000..a60a7f3
--- /dev/null
+++ b/KOR.addrlink/man/l1score.Rd
@@ -0,0 +1,18 @@
+\name{l1score}
+\alias{l1score}
+\title{Calculate L1-Distance Based Scores}
+\description{
+Reversed normalized absolute distance from zero. 
+}
+\usage{l1score(x)}
+\arguments{
+  \item{x}{A numeric vector}
+}
+\details{
+\deqn{1 - \frac{|x|}{\text{max}\{1, |x|\}}}{1 - |x| / max(1, |x|)}
+}
+\value{
+A numeric vector of the same length as x
+}
+\author{Daniel Schürmann}
+
diff --git a/KOR.addrlink/man/match_number.Rd b/KOR.addrlink/man/match_number.Rd
new file mode 100644
index 0000000..b25e48c
--- /dev/null
+++ b/KOR.addrlink/man/match_number.Rd
@@ -0,0 +1,39 @@
+\name{match_number}
+\alias{match_number}
+\title{Find Best House Number Match Within Given Street}
+\description{This is an internal function. Please use \code{\link{addrlink}}}
+\usage{match_number(record, Adressen, weights = c(0.9, 0.1))}
+
+\arguments{
+  \item{record}{data.frame with one row and three columns (Strasse, Hausnummer, Hausnummernzusatz)}
+  \item{Adressen}{data.frame of all valid addresses (same columns as record data.frame)}
+  \item{weights}{The weighing factors between house number and additional letter}
+}
+
+\details{
+If no house number and no additional letter is provided, a random address in 
+the given street is selected (qscore = 0).
+
+If only an additional letter but no house number is given and the letter is unique, 
+returns the corresponding record (qscore = 0.05). Otherwise returns a random one 
+as mentioned above (qscore = 0). 
+
+If no additional letter, but house number is provided and the maximum distance to 
+a valid house number is 4, return the closest match as calculated by 
+\code{\link{l1score}} (qscore is the result of l1score). Otherwise a random record 
+is returned (qscore = 0). 
+
+If additional letter and house number are available and the house number distance 
+is at most 4, calculates the l1scores of the house number distance and 
+additional letters distance and selects the best match (qscore is the sum of both 
+weighted l1scores). Otherwise the record is matched on the additional letter only, as described above. 
+}
+\value{
+A data.frame
+  \item{qscore}{The quality score of the match}
+  \item{Strasse}{matched street}
+  \item{Hausnummer}{matched house number}
+  \item{Hausnummernzusatz}{matched additional letter}
+}
+\author{Daniel Schürmann}
+\seealso{\code{\link{addrlink}}}
+\ No newline at end of file
diff --git a/KOR.addrlink/man/sanitize_street.Rd b/KOR.addrlink/man/sanitize_street.Rd
new file mode 100644
index 0000000..cce0ce5
--- /dev/null
+++ b/KOR.addrlink/man/sanitize_street.Rd
@@ -0,0 +1,28 @@
+\name{sanitize_street}
+\alias{sanitize_street}
+\title{Clean Street Names And Make Them Mergeable}
+\description{
+This function replaces Umlauts, expands "str" to "strasse", 
+transliterates all non-ascii characters, removes punctuation and converts 
+to lower case.
+}
+\usage{sanitize_street(x)}
+
+\arguments{
+  \item{x}{A character vector containing the street names}
+}
+\details{
+This is an internal function used in \code{addrlink}. 
+Make sure house numbers have already been extracted. 
+Use \code{split_number} or \code{split_address} for that. 
+Only street names can go into \code{sanitize_street}. 
+}
+\value{
+A character vector of the same length as x containing the 
+sanitized street names. }
+\author{Daniel Schürmann}
+
+\seealso{
+\code{\link{split_address}}, \code{\link{split_number}}, \code{\link{addrlink}}
+}
+
diff --git a/KOR.addrlink/man/split_address.Rd b/KOR.addrlink/man/split_address.Rd
new file mode 100644
index 0000000..91e801d
--- /dev/null
+++ b/KOR.addrlink/man/split_address.Rd
@@ -0,0 +1,28 @@
+\name{split_address}
+\alias{split_address}
+\title{Split Adresses Into Street, House Number And Additional Letter}
+\description{
+This function takes a character vector where each element is made up from a concatenation of 
+street name, house number and possibly an additional letter and splits it into its parts. 
+}
+\usage{split_address(x, debug = FALSE)}
+\arguments{
+  \item{x}{A character vector}
+  \item{debug}{If true, all records will be printed to the console}
+}
+\details{
+If the function fails, consider using \code{debug = TRUE}. This will print the record, which caused the error. 
+Consider filing an issue on the linked git project (see DESCRIPTION).
+}
+\value{
+A data.frame with three columns
+  \item{Strasse}{A character column containing the extracted street names}
+  \item{Hausnummer}{House number}
+  \item{Hausnummernzusatz}{Additional letter}
+}
+\author{Daniel Schürmann}
+\note{For a more advanced, general purpose solution see libpostal.}
+\seealso{\code{\link{split_number}}}
+\examples{
+split_address(c("Teststr. 8-9 a", "Erster Weg 1-2", "Ahornallee 100a-102c"))
+}
diff --git a/KOR.addrlink/man/split_number.Rd b/KOR.addrlink/man/split_number.Rd
new file mode 100644
index 0000000..c8bf7a5
--- /dev/null
+++ b/KOR.addrlink/man/split_number.Rd
@@ -0,0 +1,27 @@
+\name{split_number}
+\alias{split_number}
+\title{Split house number into house number and additional letter}
+\description{
+This function takes a character vector where each element is made up from a concatenation of 
+house number and possibly an additional letter and splits it into its parts. 
+}
+\usage{split_number(x, debug = FALSE)}
+\arguments{
+  \item{x}{A character vector}
+  \item{debug}{If true, all records will be printed to the console}
+}
+\details{
+If the function fails, consider using \code{debug = TRUE}. This will print the record, which caused the error. 
+Consider filing an issue on the linked git project (see DESCRIPTION).
+}
+\value{
+A data.frame with two columns
+  \item{Hausnummer}{House number}
+  \item{Hausnummernzusatz}{Additional letter}
+}
+\author{Daniel Schürmann}
+\note{For a more advanced, general purpose solution see libpostal.}
+\seealso{\code{\link{split_address}}}
+\examples{
+split_number(c("8-9 a", "1-2", "100a-102c"))
+}
diff --git a/KOR.addrlink/tests/Examples/KOR.addrlink-Ex.Rout.save b/KOR.addrlink/tests/Examples/KOR.addrlink-Ex.Rout.save
new file mode 100644
index 0000000..b6c22d2
--- /dev/null
+++ b/KOR.addrlink/tests/Examples/KOR.addrlink-Ex.Rout.save
@@ -0,0 +1,81 @@
+
+R version 4.3.3 (2024-02-29) -- "Angel Food Cake"
+Copyright (C) 2024 The R Foundation for Statistical Computing
+Platform: x86_64-pc-linux-gnu (64-bit)
+
+R is free software and comes with ABSOLUTELY NO WARRANTY.
+You are welcome to redistribute it under certain conditions.
+Type 'license()' or 'licence()' for distribution details.
+
+  Natural language support but running in an English locale
+
+R is a collaborative project with many contributors.
+Type 'contributors()' for more information and
+'citation()' on how to cite R or R packages in publications.
+
+Type 'demo()' for some demos, 'help()' for on-line help, or
+'help.start()' for an HTML browser interface to help.
+Type 'q()' to quit R.
+
+> pkgname <- "KOR.addrlink"
+> source(file.path(R.home("share"), "R", "examples-header.R"))
+> options(warn = 1)
+> library('KOR.addrlink')
+> 
+> base::assign(".oldSearch", base::search(), pos = 'CheckExEnv')
+> base::assign(".old_wd", base::getwd(), pos = 'CheckExEnv')
+> cleanEx()
+> nameEx("split_address")
+> ### * split_address
+> 
+> flush(stderr()); flush(stdout())
+> 
+> ### Name: split_address
+> ### Title: Split Adresses Into Street, House Number And Additional Letter
+> ### Aliases: split_address
+> 
+> ### ** Examples
+> 
+> split_address(c("Teststr. 8-9 a", "Erster Weg 1-2", "Ahornallee 100a-102c"))
+      Strasse Hausnummer Hausnummernzusatz
+1    Teststr.          8                 A
+2  Erster Weg          1              <NA>
+3 Ahornallee         102                 C
+> 
+> 
+> 
+> cleanEx()
+> nameEx("split_number")
+> ### * split_number
+> 
+> flush(stderr()); flush(stdout())
+> 
+> ### Name: split_number
+> ### Title: Split house number into house number and additional letter
+> ### Aliases: split_number
+> 
+> ### ** Examples
+> 
+> split_number(c("8-9 a", "1-2", "100a-102c"))
+  Hausnummer Hausnummernzusatz
+1          8              <NA>
+2          1              <NA>
+3        100                 a
+> 
+> 
+> 
+> ### * <FOOTER>
+> ###
+> cleanEx()
+> options(digits = 7L)
+> base::cat("Time elapsed: ", proc.time() - base::get("ptime", pos = 'CheckExEnv'),"\n")
+Time elapsed:  0.155 0.019 0.224 0.009 0.006 
+> grDevices::dev.off()
+null device 
+          1 
+> ###
+> ### Local variables: ***
+> ### mode: outline-minor ***
+> ### outline-regexp: "\\(> \\)?### [*]+" ***
+> ### End: ***
+> quit('no')
diff --git a/KOR.addrlink/tests/test_l1score.R b/KOR.addrlink/tests/test_l1score.R
new file mode 100644
index 0000000..6708f80
--- /dev/null
+++ b/KOR.addrlink/tests/test_l1score.R
@@ -0,0 +1 @@
+KOR.addrlink:::l1score(c(-4, -1.3, 0, NA, 5.1, 10))
diff --git a/KOR.addrlink/tests/test_l1score.Rout.save b/KOR.addrlink/tests/test_l1score.Rout.save
new file mode 100644
index 0000000..6e952b2
--- /dev/null
+++ b/KOR.addrlink/tests/test_l1score.Rout.save
@@ -0,0 +1,23 @@
+
+R version 4.3.3 (2024-02-29) -- "Angel Food Cake"
+Copyright (C) 2024 The R Foundation for Statistical Computing
+Platform: x86_64-pc-linux-gnu (64-bit)
+
+R is free software and comes with ABSOLUTELY NO WARRANTY.
+You are welcome to redistribute it under certain conditions.
+Type 'license()' or 'licence()' for distribution details.
+
+R is a collaborative project with many contributors.
+Type 'contributors()' for more information and
+'citation()' on how to cite R or R packages in publications.
+
+Type 'demo()' for some demos, 'help()' for on-line help, or
+'help.start()' for an HTML browser interface to help.
+Type 'q()' to quit R.
+
+> KOR.addrlink:::l1score(c(-4, -1.3, 0, NA, 5.1, 10))
+[1] 0.60 0.87 1.00   NA 0.49 0.00
+> 
+> proc.time()
+   user  system elapsed 
+  0.242   0.062   0.266 
diff --git a/KOR.addrlink/tests/test_sanitize_street.R b/KOR.addrlink/tests/test_sanitize_street.R
new file mode 100644
index 0000000..3543a6b
--- /dev/null
+++ b/KOR.addrlink/tests/test_sanitize_street.R
@@ -0,0 +1 @@
+KOR.addrlink:::sanitize_street(c("Binde-Strich-Strasse", "Teststr.", "A.-B.-C.-Str."))
diff --git a/KOR.addrlink/tests/test_sanitize_street.Rout.save b/KOR.addrlink/tests/test_sanitize_street.Rout.save
new file mode 100644
index 0000000..3bb9e3f
--- /dev/null
+++ b/KOR.addrlink/tests/test_sanitize_street.Rout.save
@@ -0,0 +1,23 @@
+
+R version 4.3.3 (2024-02-29) -- "Angel Food Cake"
+Copyright (C) 2024 The R Foundation for Statistical Computing
+Platform: x86_64-pc-linux-gnu (64-bit)
+
+R is free software and comes with ABSOLUTELY NO WARRANTY.
+You are welcome to redistribute it under certain conditions.
+Type 'license()' or 'licence()' for distribution details.
+
+R is a collaborative project with many contributors.
+Type 'contributors()' for more information and
+'citation()' on how to cite R or R packages in publications.
+
+Type 'demo()' for some demos, 'help()' for on-line help, or
+'help.start()' for an HTML browser interface to help.
+Type 'q()' to quit R.
+
+> KOR.addrlink:::sanitize_street(c("Binde-Strich-Strasse", "Teststr.", "A.-B.-C.-Str."))
+[1] "bindestrichstrasse" "teststrasse"        "abcstrasse"        
+> 
+> proc.time()
+   user  system elapsed 
+  0.259   0.059   0.289 
diff --git a/KOR.addrlink/tests/test_split_address.R b/KOR.addrlink/tests/test_split_address.R
new file mode 100644
index 0000000..7b7db48
--- /dev/null
+++ b/KOR.addrlink/tests/test_split_address.R
@@ -0,0 +1,2 @@
+KOR.addrlink::split_address(c("Teststr. 8-9 a", "Erster Weg 1-2", "Ahornallee 100a-102c", 
+"Stra\u00dfe des 1. Mai 10/12", "Emmerich-K\u00e1lm\u00e1n-Stra\u00dfe 15"))
diff --git a/KOR.addrlink/tests/test_split_address.Rout.save b/KOR.addrlink/tests/test_split_address.Rout.save
new file mode 100644
index 0000000..65ab67b
--- /dev/null
+++ b/KOR.addrlink/tests/test_split_address.Rout.save
@@ -0,0 +1,29 @@
+
+R version 4.3.3 (2024-02-29) -- "Angel Food Cake"
+Copyright (C) 2024 The R Foundation for Statistical Computing
+Platform: x86_64-pc-linux-gnu (64-bit)
+
+R is free software and comes with ABSOLUTELY NO WARRANTY.
+You are welcome to redistribute it under certain conditions.
+Type 'license()' or 'licence()' for distribution details.
+
+R is a collaborative project with many contributors.
+Type 'contributors()' for more information and
+'citation()' on how to cite R or R packages in publications.
+
+Type 'demo()' for some demos, 'help()' for on-line help, or
+'help.start()' for an HTML browser interface to help.
+Type 'q()' to quit R.
+
+> KOR.addrlink::split_address(c("Teststr. 8-9 a", "Erster Weg 1-2", "Ahornallee 100a-102c", 
++ "Stra\u00dfe des 1. Mai 10/12", "Emmerich-K\u00e1lm\u00e1n-Stra\u00dfe 15"))
+                 Strasse Hausnummer Hausnummernzusatz
+1               Teststr.          8                 A
+2             Erster Weg          1              <NA>
+3            Ahornallee         102                 C
+4      Straße des 1. Mai         10              <NA>
+5 Emmerich-Kálmán-Straße         15              <NA>
+> 
+> proc.time()
+   user  system elapsed 
+  0.277   0.053   0.299 
diff --git a/KOR.addrlink/tests/test_split_number.R b/KOR.addrlink/tests/test_split_number.R
new file mode 100644
index 0000000..81d01eb
--- /dev/null
+++ b/KOR.addrlink/tests/test_split_number.R
@@ -0,0 +1 @@
+KOR.addrlink::split_number(c("8-9 a", "1-2", "100a-102c", "2/3", "25/27a", "3 5"))
diff --git a/KOR.addrlink/tests/test_split_number.Rout.save b/KOR.addrlink/tests/test_split_number.Rout.save
new file mode 100644
index 0000000..766e419
--- /dev/null
+++ b/KOR.addrlink/tests/test_split_number.Rout.save
@@ -0,0 +1,29 @@
+
+R version 4.3.3 (2024-02-29) -- "Angel Food Cake"
+Copyright (C) 2024 The R Foundation for Statistical Computing
+Platform: x86_64-pc-linux-gnu (64-bit)
+
+R is free software and comes with ABSOLUTELY NO WARRANTY.
+You are welcome to redistribute it under certain conditions.
+Type 'license()' or 'licence()' for distribution details.
+
+R is a collaborative project with many contributors.
+Type 'contributors()' for more information and
+'citation()' on how to cite R or R packages in publications.
+
+Type 'demo()' for some demos, 'help()' for on-line help, or
+'help.start()' for an HTML browser interface to help.
+Type 'q()' to quit R.
+
+> KOR.addrlink::split_number(c("8-9 a", "1-2", "100a-102c", "2/3", "25/27a", "3 5"))
+  Hausnummer Hausnummernzusatz
+1          8              <NA>
+2          1              <NA>
+3        100                 a
+4          2              <NA>
+5         25              <NA>
+6          3              <NA>
+> 
+> proc.time()
+   user  system elapsed 
+  0.246   0.054   0.274 
diff --git a/KOR.addrlink/vignettes/Example.Rnw b/KOR.addrlink/vignettes/Example.Rnw
new file mode 100644
index 0000000..aef9c2e
--- /dev/null
+++ b/KOR.addrlink/vignettes/Example.Rnw
@@ -0,0 +1,143 @@
+\documentclass{article}
+%\VignetteIndexEntry{Example}
+\usepackage[utf8]{inputenc}
+\begin{document}
+\SweaveOpts{concordance=TRUE}
+\title{Using KOR.addrlink}
+\author{Daniel Sch\"urmann}
+\date{February 29, 2024}
+\maketitle
+
+\section{Introduction}
+
+Consider a data set with semi-structured address data, e.g. street and house number as a concatenated string, 
+wrongly spelled street names or non-existing house numbers. This data set (referred to as df\_match) should be 
+mapped to a complete list of valid addresses within the given municipality. The latter data set is 
+called df\_ref and may include further information like coordinates of addresses or district information.
+KOR.addrlink tries to solve this problem specifically for German municipalities, as the package is specialized 
+for German address schemes. 
+
+\section{Reference data}
+
+First, a complete list of reference addresses (df\_ref) is needed. An example 
+data.frame named "Adressen" is shown below. 
+
+<<>>=
+library(KOR.addrlink)
+Adressen[c(sample(which(is.na(Adressen$HNRZ)), 4), 
+	sample(which(!is.na(Adressen$HNRZ)), 2)),]
+@
+
+The columns used for the matching procedure are STRNAME (street name), HNR (house number) 
+and HNRZ (additional letter). This vignette illustrates the merging workflow on two sample data sets called df1 and df2. 
+
+\section{Example 1}
+df1 has address information in columns gross\_strasse and hausnr. 
+The columns Var1 and Var2 provide non-address related information about 
+the individuals. Row 1183 shows that the column hausnr needs to be split 
+into house number and additional letter before addresses can be matched. 
+The function split\_number is provided for that task. 
+
+<<>>=
+df1[1180:(1183+6),]
+@
+
+split\_number takes hausnr and creates a data.frame with columns "Hausnummer" 
+(house number) and "Hausnummernzusatz" (additional letter). 
+
+<<>>=
+df1 <- cbind(df1, split_number(df1$hausnr))
+df1[1180:(1183+6),]
+@
+
+addrlink merges the two data sets. For both data sets, the columns referring 
+to street name, house number and additional letter need to be specified 
+in exactly that order (parameter col\_ref and col\_match). 
+
+<<>>=
+# column hausnr is no longer needed
+df1 <- within(df1, rm(hausnr))
+df1_matched <- addrlink(df_ref = Adressen, 
+	col_ref = c("STRNAME", "HNR", "HNRZ"), 
+	df_match = df1, 
+	col_match = c("gross_strasse", "Hausnummer", "Hausnummernzusatz"))
+@
+
+The result is a list with two data.frames
+\begin{itemize}
+\item ret: The merged data set
+\item QA: Indicators showing the match quality
+\end{itemize}
+
+<<>>=
+head(df1_matched$ret)
+table(df1_matched$QA$qAddress)
+@
+
+qAddress states the stage within the matching procedure that yielded the match. 
+Out of the 10000 records, 9670 could be merged directly. 72 had a valid street 
+name, but an invalid house number. 157 records had (possibly) misspelled street 
+names and 101 records could not be matched at all. 
+
+\section{Example 2}
+
+The second data set has a single column "Adresse", which includes street names 
+and house numbers. Thus, this column needs to be split by the function 
+split\_address. 
+
+<<>>=
+head(within(df2, Adresse <- trimws(Adresse)))
+@
+
+split\_address creates a data.frame with columns "Strasse" (street), "Hausnummer" 
+(house number) and "Hausnummernzusatz" (additional letter) from the column 
+"Adresse". 
+
+<<>>=
+df2 <- cbind(df2, split_address(df2$Adresse))
+within(df2, Adresse <- trimws(Adresse))[23:(23+6),]
+@
+
+Again, addrlink merges the two data sets. The parameter fuzzy\_threshold 
+sets the threshold for fuzzy matching of misspelled street names. A value 
+of 1 means no fuzzy matching and 0 means forced fuzzy matches for all records. 
+If a street name could be matched, but the provided house number does not exist, addrlink 
+may randomly assign a valid house number to that record. A seed is always set 
+to ensure reproducibility. Customization is possible via the parameter seed. 
+
+<<>>=
+# column Adresse is no longer needed
+df2 <- within(df2, rm(Adresse))
+df2_matched <- addrlink(df_ref = Adressen, 
+	col_ref = c("STRNAME", "HNR", "HNRZ"), 
+	df_match = df2, 
+	col_match = c("Strasse", "Hausnummer", "Hausnummernzusatz"), 
+	fuzzy_threshold = .9, seed = 1234)
+@
+
+<<>>=
+head(df2_matched$ret)
+table(df2_matched$QA$qAddress)
+@
+
+49 records had invalid house numbers and one record was matched by 
+fuzzy matching. This record can be inspected in detail. 
+
+<<>>=
+id <- which(df2_matched$QA$qAddress == 3) 
+df2_matched$ret[id,]
+df2_matched$QA[id,]
+@
+
+In this case the fuzzy matching procedure was most likely correct 
+(St.-Georg-Str. matched SANKT-GEORG-STRA{\ss}E).
+
+The number of cases with correct street name and randomly assigned house 
+numbers is 10.
+
+<<>>=
+sum(df2_matched$QA$qscore == 0) 
+@
+
+
+\end{document}
diff --git a/README.html b/README.html
new file mode 100644
index 0000000..1494e7f
--- /dev/null
+++ b/README.html
@@ -0,0 +1,18 @@
+<h1 id="kor.addrlink">KOR.addrlink</h1>
+<h3 id="beschreibung">Beschreibung</h3>
+<p>Die Ausgangssituation ist eine Datenlieferung mit Adressdaten als
+Textfeld. Diese sollen anhand einer vorhandenen Adressdatenbank
+geokodiert werden (z.B. den UBZ zugeordnet werden)</p>
+<h2 id="installation">Installation</h2>
+<p><code>install.packages("KOR.addrlink")</code></p>
+<h2 id="benutzung">Benutzung</h2>
+<p>Am besten in die Vignette gucken. Da sind zwei verschiedene Beispiele
+drin.</p>
+<h2 id="hilfe">Hilfe</h2>
+<p>d.schuermann@2718282.net</p>
+<h2 id="beitragen">Beitragen</h2>
+<p>Issues und pull requests sind gerne gesehen!</p>
+<h2 id="autor">Autor</h2>
+<p>Daniel Schürmann</p>
+<h2 id="lizenz">Lizenz</h2>
+<p>GPL-3</p>
+\ No newline at end of file
author	Daniel Schürmann <d.schuermann@2718282.net>	2024-03-10 12:22:13 +0100
committer	Daniel Schürmann <d.schuermann@2718282.net>	2024-03-10 12:22:13 +0100
commit	e02f41b9b1dc3c45f6626e8b01fee2acb5b905d4 (patch)
tree	8fa7087b5f5fcfe5513c97ef7fa6f0fdb9edf614
download	KOR.addrlink-390f76d388fa183fdf5448219525b79cefe909f6.tar.gz KOR.addrlink-390f76d388fa183fdf5448219525b79cefe909f6.zip