diff options
| author | Daniel Schürmann <d.schuermann@2718282.net> | 2024-03-10 12:22:13 +0100 |
|---|---|---|
| committer | Daniel Schürmann <d.schuermann@2718282.net> | 2024-03-10 12:22:13 +0100 |
| commit | e02f41b9b1dc3c45f6626e8b01fee2acb5b905d4 (patch) | |
| tree | 8fa7087b5f5fcfe5513c97ef7fa6f0fdb9edf614 | |
| download | KOR.addrlink-1.0.1.tar.gz KOR.addrlink-1.0.1.zip | |
36 files changed, 940 insertions, 0 deletions
diff --git a/KOR.addrlink/DESCRIPTION b/KOR.addrlink/DESCRIPTION new file mode 100644 index 0000000..8761e79 --- /dev/null +++ b/KOR.addrlink/DESCRIPTION @@ -0,0 +1,21 @@ +Package: KOR.addrlink +Type: Package +Title: Matching Address Data to Reference Index +Version: 1.0.1 +Date: 2024-03-02 +Author: Daniel Schürmann [aut, cre] +Authors@R: person("Daniel", "Schürmann", role = c("aut", "cre"), email = "d.schuermann@2718282.net") +Maintainer: Daniel Schürmann <d.schuermann@2718282.net> +Depends: R (>= 3.4) +Imports: stringdist, stringi +LazyData: true +Description: Matches a data set with semi-structured address data, + e.g., street and house number as a concatenated string, + wrongly spelled street names or non-existing house numbers to a + reference index. The methods are specifically designed for German + municipalities ('KOR'-community) and German address schemes. +License: GPL-3 +Encoding: UTF-8 +URL: https://git-kor.stadtdo.de +BugReports: https://git-kor.stadtdo.de/stadt-dortmund/adressdaten/-/issues + diff --git a/KOR.addrlink/NAMESPACE b/KOR.addrlink/NAMESPACE new file mode 100644 index 0000000..6a6a065 --- /dev/null +++ b/KOR.addrlink/NAMESPACE @@ -0,0 +1,4 @@ +export("addrlink", "split_address", "split_number") +importFrom("utils", "head") +importFrom("stringi", "stri_replace_all_regex", "stri_trans_general", "stri_trans_nfc") +importFrom("stringdist", "stringsimmatrix") diff --git a/KOR.addrlink/R/addrlink.R b/KOR.addrlink/R/addrlink.R new file mode 100644 index 0000000..49deec8 --- /dev/null +++ b/KOR.addrlink/R/addrlink.R @@ -0,0 +1,111 @@ +addrlink <- +function(df_ref, df_match, + col_ref = c("Strasse", "Hausnummer", "Hausnummernzusatz"), + col_match = c("Strasse", "Hausnummer", "Hausnummernzusatz"), + fuzzy_threshold = .9, seed = 1234){ + + stopifnot(is.data.frame(df_ref), is.data.frame(df_match), + is.vector(col_ref), is.character(col_ref), + is.vector(col_match), is.character(col_match)) + stopifnot(unique(colnames(df_ref)) == colnames(df_ref)) + 
stopifnot(unique(colnames(df_match)) == colnames(df_match)) + stopifnot(any(!is.na(col_ref), !is.na(col_match))) + stopifnot(length(col_ref) == 3, length(col_match) == 3) + stopifnot(col_ref %in% colnames(df_ref)) + stopifnot(col_match %in% colnames(df_match)) + stopifnot(fuzzy_threshold < 1, fuzzy_threshold > 0) + stopifnot(seed > 0) + set.seed(seed) + + Adressen <- df_ref[, col_ref] + stopifnot(is.character(Adressen[,1]), is.numeric(Adressen[,2]), is.character(Adressen[,3])) + stopifnot(nrow(unique(Adressen)) == nrow(Adressen)) + stopifnot(nrow(Adressen) > 0) + colnames(Adressen) <- c("Strasse", "Hausnummer", "Hausnummernzusatz") + Adressen$id.addr <- 1:nrow(Adressen) + Adressen$Strasse <- sanitize_street(Adressen$Strasse) + Adressen$Hausnummernzusatz <- tolower(Adressen$Hausnummernzusatz) + + df <- df_match[, col_match] + stopifnot(is.character(df[,1]), is.numeric(df[,2]), + is.character(df[,3])) + stopifnot(nrow(df) > 0) + colnames(df) <- c("Strasse", "Hausnummer", "Hausnummernzusatz") + df$id.df <- 1:nrow(df) + df$Strasse <- sanitize_street(df$Strasse) + df$Hausnummernzusatz <- tolower(df$Hausnummernzusatz) + + # first pass (direct matches) + fp <- merge(x = df, y = Adressen, + by.x = c("Strasse", "Hausnummer", "Hausnummernzusatz"), + by.y = c("Strasse", "Hausnummer", "Hausnummernzusatz"), + incomparables = NULL) + if(nrow(fp) > 0){ + fp$qAddress <- 1 + fp$qscore <- 1 + } else fp <- data.frame(Strasse = character(), Hausnummer = numeric(), + Hausnummernzusatz = character(), id.addr = numeric(), id.df = numeric(), + qAddress = numeric(), qscore = numeric()) + + # second pass (street correct) + tmp <- df[!(df$id.df %in% fp$id.df) & !is.na(df$Strasse),] + sp <- merge(x = tmp, y = unique(Adressen[, "Strasse", drop = FALSE]), + by.x = c("Strasse"), + by.y = c("Strasse"), + incomparables = NULL) + if(nrow(sp) > 0){ + sp <- cbind(id.df = sp$id.df, + do.call(rbind, apply(X = sp, MARGIN = 1, + FUN = match_number, Adressen = Adressen))) + sp$qAddress <- 2 + sp <- 
merge(x = sp, y = Adressen, + by.x = c("Strasse", "Hausnummer", "Hausnummernzusatz"), + by.y = c("Strasse", "Hausnummer", "Hausnummernzusatz"), + incomparables = NULL) + } else sp <- data.frame(Strasse = character(), Hausnummer = numeric(), + Hausnummernzusatz = character(), id.addr = numeric(), id.df = numeric(), + qAddress = numeric(), qscore = numeric()) + + ## third pass (fuzzy matches) + tp <- df[!(df$id.df %in% c(fp$id.df, sp$id.df)) & !is.na(df$Strasse),] + uSTR <- unique(Adressen$Strasse) + tmp <- stringdist::stringsimmatrix(a = tp$Strasse, b = uSTR, method = "jw", + nthread = max(1, floor(parallel::detectCores() / 2))) + threshold <- which(apply(tmp, MARGIN = 1, FUN = max) > fuzzy_threshold) + if(length(threshold) > 0){ + tp$Strasse[threshold] <- uSTR[unlist(apply(tmp, MARGIN = 1, FUN = which.max))[threshold]] + tp <- merge(x = tp, y = unique(Adressen[, "Strasse", drop = FALSE]), + by.x = c("Strasse"), + by.y = c("Strasse"), + incomparables = NULL) + tp <- cbind(id.df = tp$id.df, + do.call(rbind, apply(X = tp, MARGIN = 1, + FUN = match_number, Adressen = Adressen))) + tp$qscore <- tp$qscore * apply((tmp[threshold, , drop = FALSE]), 1, max) + tp$qAddress <- 3 + tp <- merge(x = tp, y = Adressen, + by.x = c("Strasse", "Hausnummer", "Hausnummernzusatz"), + by.y = c("Strasse", "Hausnummer", "Hausnummernzusatz"), + incomparables = NULL) + } else tp <- data.frame(Strasse = character(), Hausnummer = numeric(), + Hausnummernzusatz = character(), id.addr = numeric(), id.df = numeric(), + qAddress = numeric(), qscore = numeric()) + + # no match + nomatch <- df[!(df$id.df %in% c(fp$id.df, sp$id.df, tp$id.df)),] + if(nrow(nomatch) > 0){ + nomatch$Strasse <- NA + nomatch$Hausnummer <- NA + nomatch$Hausnummernzusatz <- NA + nomatch$id.addr <- NA + nomatch$qAddress <- 4 + nomatch$qscore <- 0 + } else nomatch <- data.frame(Strasse = character(), Hausnummer = numeric(), + Hausnummernzusatz = character(), id.addr = numeric(), id.df = numeric(), + qAddress = numeric(), qscore 
= numeric()) + + # results + res <- rbind(fp, sp, tp, nomatch) + ret <- cbind(df_match[res$id.df,], df_ref[res$id.addr,]) + return(list(ret = ret, QA = res[, c("qAddress", "qscore")])) +} diff --git a/KOR.addrlink/R/helper_split_address.R b/KOR.addrlink/R/helper_split_address.R new file mode 100644 index 0000000..e750ae8 --- /dev/null +++ b/KOR.addrlink/R/helper_split_address.R @@ -0,0 +1,33 @@ +helper_split_address <- +function(x, debug = FALSE){ + if(debug) print(x) + x <- trimws(x) + x_split <- unlist(strsplit(x, '')) + num.idx <- which(x_split %in% as.character(0:9)) + if(length(num.idx) == 0){ return(list(strasse = x, hnr = NA, hnrz = NA)) } + max.num.idx <- max(num.idx) + num.extra <- which(x_split %in% c(" ", "-", "/")) + num.extra <- num.extra[min(num.idx) < num.extra & max.num.idx > num.extra] + num.idx <- c(num.idx, num.extra) + idx.subs <- lapply(min(1, (max.num.idx - 7)):max.num.idx, function(x) x:max.num.idx) + idx.hnr <- idx.subs[[min(which(unlist(lapply(idx.subs, function(y) all(y %in% num.idx)))))]] + hnr <- trimws(substr(x, min(idx.hnr), max(idx.hnr))) + if(grepl("-", hnr)){ + hnr <- as.numeric(unlist(strsplit(hnr, "-"))) + hnr <- head(hnr[!is.na(hnr)], 1) + } + if(grepl(" ", hnr)){ + hnr <- as.numeric(unlist(strsplit(hnr, " "))) + hnr <- head(hnr[!is.na(hnr)], 1) + } + if(grepl("/", hnr)){ + hnr <- as.numeric(unlist(strsplit(hnr, "/"))) + hnr <- head(hnr[!is.na(hnr)], 1) + } else { + hnr <- as.numeric(hnr)} + hnrz <- toupper(substr(trimws(substr(x, max(idx.hnr) + 1, nchar(x))), 1, 1)) + if(nchar(hnrz) == 0) hnrz <- NA + strasse <- trimws(substr(x, 1, min(idx.hnr) - 1)) + strasse <- sub("[[:digit:]]+[a-zA-Z]$", "", strasse) + return(list(strasse = strasse, hnr = hnr, hnrz = hnrz)) +} diff --git a/KOR.addrlink/R/helper_split_number.R b/KOR.addrlink/R/helper_split_number.R new file mode 100644 index 0000000..c19e89d --- /dev/null +++ b/KOR.addrlink/R/helper_split_number.R @@ -0,0 +1,25 @@ +helper_split_number <- +function(x, debug = FALSE){ + 
if(debug) print(x) + x <- stringi::stri_replace_all_regex(str = x, + pattern = c("-", "/", "\\s+"), replace = c(" ", " ", " "), + vectorize_all = FALSE) + x <- trimws(x) + if(nchar(x) == 0){ return(data.frame(Hausnummer = NA, Zusatz = NA)) } + x_split <- unlist(strsplit(x, '')) + x_start <- head(which(x_split %in% as.character(0:9)), 1) + x <- substr(x, x_start, nchar(x)) + x_split <- unlist(strsplit(x, '')) + if(" " %in% x_split){ + x <- strsplit(x, ' ')[[1]][1] + x_split <- unlist(strsplit(x, '')) + } + idx <- suppressWarnings(as.numeric(x_split)) + idx <- !is.na(idx) + idx_rle <- rle(idx) + hausnr <- as.numeric(substr(x, 1, head(idx_rle$length, 1))) + if(hausnr == ""){ hausnr <- NA } + zusatz <- substr(x, head(idx_rle$length, 1) + 1, head(idx_rle$length, 1) + 1) + if(zusatz == ""){ zusatz <- NA } + return(data.frame(Hausnummer = hausnr, Zusatz = zusatz)) +} diff --git a/KOR.addrlink/R/l1score.R b/KOR.addrlink/R/l1score.R new file mode 100644 index 0000000..86154ed --- /dev/null +++ b/KOR.addrlink/R/l1score.R @@ -0,0 +1,8 @@ +l1score <- +function(x){ + stopifnot(is.vector(x) & is.numeric(x)) + if(sum(is.na(x)) == length(x)){return(rep(1, length(x)))} + stopifnot(is.numeric(x)) + x <- abs(x) + 1 - x / max(c(1, x), na.rm = TRUE) +} diff --git a/KOR.addrlink/R/match_number.R b/KOR.addrlink/R/match_number.R new file mode 100644 index 0000000..34e2ade --- /dev/null +++ b/KOR.addrlink/R/match_number.R @@ -0,0 +1,52 @@ +match_number <- +function(record, Adressen, weights = c(.9, .1)){ + valid <- Adressen[Adressen$Strasse == record[["Strasse"]], c("Hausnummer", "Hausnummernzusatz")] + if(is.na(record[["Hausnummer"]]) & is.na(record[["Hausnummernzusatz"]])){ #no info + return(cbind(qscore = 0, Strasse = record[["Strasse"]], valid[sample(1:nrow(valid), 1),])) + } + if(is.na(record[["Hausnummer"]]) & !is.na(record[["Hausnummernzusatz"]])){ #no hnr, but hnrz + zusatz <- l1score(match(record[["Hausnummernzusatz"]], LETTERS) - + match(valid$Hausnummernzusatz, LETTERS)) * 
weights[2] + val <- max(zusatz, na.rm = TRUE) + ids <- which(zusatz == val) + if(length(ids) == 0){ + return(cbind(qscore = 0, Strasse = record[["Strasse"]], valid[sample(1:nrow(valid), 1),])) + } + if(length(ids) == 1){ + return(cbind(qscore = 0.05, Strasse = record[["Strasse"]], valid[ids,]))} + return(cbind(qscore = 0.05, Strasse = record[["Strasse"]], valid[sample(ids, 1),])) + } + if(!is.na(record[["Hausnummer"]]) & is.na(record[["Hausnummernzusatz"]])){ #hnr, no hnrz + hausnr_diff <- as.numeric(record[["Hausnummer"]]) - as.numeric(valid$Hausnummer) + hausnr <- l1score(hausnr_diff) * weights[1] + if(min(abs(hausnr_diff), na.rm = TRUE) > 4){ + return(cbind(qscore = 0, Strasse = record[["Strasse"]], valid[sample(1:nrow(valid), 1),])) + } + val <- max(hausnr, na.rm = TRUE) + ids <- which(hausnr == val) + if(length(ids) == 1){ + return(cbind(qscore = val, Strasse = record[["Strasse"]], valid[ids,]))} + return(cbind(qscore = val, Strasse = record[["Strasse"]], valid[sample(ids, 1),])) + } + hausnr_diff <- as.numeric(record[["Hausnummer"]]) - as.numeric(valid$Hausnummer) + hausnr <- l1score(hausnr_diff) * weights[1] + zusatz <- l1score(match(record[["Hausnummernzusatz"]], LETTERS) - + match(valid$Hausnummernzusatz, LETTERS)) * weights[2] + if(min(abs(hausnr_diff), na.rm = TRUE) > 4){#no hnr, but hnrz + val <- max(zusatz, na.rm = TRUE) + ids <- which(zusatz == val) + if(length(ids) == 0){ + return(cbind(qscore = 0, Strasse = record[["Strasse"]], valid[sample(1:nrow(valid), 1),])) + } + if(length(ids) == 1){ + return(cbind(qscore = 0.05, Strasse = record[["Strasse"]], valid[ids,]))} + return(cbind(qscore = 0.05, Strasse = record[["Strasse"]], valid[sample(ids, 1),])) + } + zusatz[is.na(zusatz)] <- - 0.05 + score <- hausnr + zusatz + val <- max(score, na.rm = TRUE) + ids <- which(score == val) + if(length(ids) == 1){ + return(cbind(qscore = val, Strasse = record[["Strasse"]], valid[ids,]))} + return(cbind(qscore = val, Strasse = record[["Strasse"]], valid[sample(ids, 
1),])) +}
\ No newline at end of file diff --git a/KOR.addrlink/R/sanitize_street.R b/KOR.addrlink/R/sanitize_street.R new file mode 100644 index 0000000..d7ff05f --- /dev/null +++ b/KOR.addrlink/R/sanitize_street.R @@ -0,0 +1,15 @@ +sanitize_street <- +function(x){ + stopifnot(is.character(x), is.vector(x)) + x <- tolower(x) + pattern <- c("\u00e4", "\u00fc", "\u00f6", "\u00df", "-", " ", "\\.", "str$") + replacement <- c("ae", "ue", "oe", "ss", "", "", "", "strasse") + x <- stringi::stri_replace_all_regex(str = x, + pattern = pattern, replace = replacement, + vectorize_all = FALSE) + x <- stringi::stri_trans_general(str = x, id = "Any-Latin;Latin-ASCII") + x <- stringi::stri_trans_nfc(str = x) + x <- stringi::stri_replace_all_regex(str = x, + pattern = "[[:punct:]]", replace = "", vectorize_all = FALSE) + return(x) +} diff --git a/KOR.addrlink/R/split_address.R b/KOR.addrlink/R/split_address.R new file mode 100644 index 0000000..4fc71d5 --- /dev/null +++ b/KOR.addrlink/R/split_address.R @@ -0,0 +1,10 @@ +split_address <- +function(x, debug = FALSE) { + stopifnot(is.character(x), is.vector(x)) + vec_split_address <- Vectorize(helper_split_address, vectorize.args = "x", + USE.NAMES = FALSE, SIMPLIFY = FALSE) + res <- vec_split_address(x, debug = debug) + return(data.frame(Strasse = unlist(lapply(res, '[[', 1)), + Hausnummer = unlist(lapply(res, '[[', 2)), + Hausnummernzusatz = unlist(lapply(res, '[[', 3)))) +} diff --git a/KOR.addrlink/R/split_number.R b/KOR.addrlink/R/split_number.R new file mode 100644 index 0000000..4f7e584 --- /dev/null +++ b/KOR.addrlink/R/split_number.R @@ -0,0 +1,16 @@ +split_number <- +function(x, debug = FALSE){ + stopifnot(is.character(x), is.vector(x)) + x <- trimws(x) + x[which(x == "0" | x == "NULL" | x == "")] <- NA + x_ready <- data.frame(Hausnummer = suppressWarnings(as.numeric(x)), + Hausnummernzusatz = NA) + ids <- which(!is.na(x) & is.na(x_ready$Hausnummer)) + vec_split_hnr <- Vectorize(helper_split_number, vectorize.args = "x", + 
USE.NAMES = FALSE, SIMPLIFY = FALSE) + res <- vec_split_hnr(x[ids], debug = debug) + res <- data.frame(Hausnummer = unlist(lapply(res, '[[', 1)), + Hausnummernzusatz = unlist(lapply(res, '[[', 2))) + x_ready[ids,] <- res + return(x_ready) +} diff --git a/KOR.addrlink/data/Adressen.RData b/KOR.addrlink/data/Adressen.RData Binary files differnew file mode 100644 index 0000000..af08d1e --- /dev/null +++ b/KOR.addrlink/data/Adressen.RData diff --git a/KOR.addrlink/data/df1.RData b/KOR.addrlink/data/df1.RData Binary files differnew file mode 100644 index 0000000..032d1e1 --- /dev/null +++ b/KOR.addrlink/data/df1.RData diff --git a/KOR.addrlink/data/df2.RData b/KOR.addrlink/data/df2.RData Binary files differnew file mode 100644 index 0000000..41f7f4d --- /dev/null +++ b/KOR.addrlink/data/df2.RData diff --git a/KOR.addrlink/man/Adressen.Rd b/KOR.addrlink/man/Adressen.Rd new file mode 100644 index 0000000..b04cc94 --- /dev/null +++ b/KOR.addrlink/man/Adressen.Rd @@ -0,0 +1,21 @@ +\name{Adressen} +\docType{data} +\alias{Adressen} +\title{Address data from the city of Dortmund} +\description{ + This data set gives all the addresses in the city of Dortmund. +} +\usage{Adressen} +\format{A data.frame + \tabular{lll}{ + STRNAME \tab character \tab street name\cr + STRSL \tab numeric \tab street number\cr + HNR \tab numeric \tab house number\cr + HNRZ \tab character \tab additional letter\cr + RW \tab numeric \tab longitude \cr + HW \tab numeric \tab latitude \cr + UBZ \tab numeric \tab subdistrict number + } +} +\source{\url{https://open-data.dortmund.de}} +\keyword{datasets}
\ No newline at end of file diff --git a/KOR.addrlink/man/KOR.addrlink-package.Rd b/KOR.addrlink/man/KOR.addrlink-package.Rd new file mode 100644 index 0000000..9885e03 --- /dev/null +++ b/KOR.addrlink/man/KOR.addrlink-package.Rd @@ -0,0 +1,15 @@ +\name{KOR.addrlink-package} +\alias{KOR.addrlink-package} +\alias{KOR.addrlink} +\docType{package} +\title{KOR.addrlink} +\description{Geocode address data from German municipalities} +\details{ +\itemize{ + \item \code{\link{split_address}} Splits strings into street, house number and additional letter + \item \code{\link{split_number}} Splits strings into house number and additional letter + \item \code{\link{addrlink}} Matches split address data to reference table +} +Matching is based on street name, house number and additional letter. +} +\author{Daniel Schürmann} diff --git a/KOR.addrlink/man/addrlink.Rd b/KOR.addrlink/man/addrlink.Rd new file mode 100644 index 0000000..099c1cc --- /dev/null +++ b/KOR.addrlink/man/addrlink.Rd @@ -0,0 +1,47 @@ +\name{addrlink} +\alias{addrlink} +\title{Merge Data To Reference Index} +\description{ +Takes two data.frames with address data and merges them together. +} +\usage{ +addrlink(df_ref, df_match, +col_ref = c("Strasse", "Hausnummer", "Hausnummernzusatz"), +col_match = c("Strasse", "Hausnummer", "Hausnummernzusatz"), +fuzzy_threshold = 0.9, seed = 1234) +} +%- maybe also 'usage' for other objects documented here. 
+\arguments{ + \item{df_ref}{data.frame with address references} + \item{df_match}{data.frame with addresses to be matched} + \item{col_ref}{character vector of length three, naming the df_ref columns which contain the street names, house numbers and additional letters (in that order)} + \item{col_match}{character vector of length three, naming the df_match columns which contain the street names, house numbers and additional letters (in that order)} + \item{fuzzy_threshold}{The threshold used for fuzzy matching street names} + \item{seed}{Seed for random numbers} +} +\details{ +The matching is done in four stages. + +\bold{Stage 1} (qAddress = 1). This is an exact match (highest quality, qscore = 1) + +\bold{Stage 2} (qAddress = 2). Exact match on street name, but no valid house +number could be found. Be aware that random house numbers might be used. +Consider setting your own seed. qscore indicates the match quality. +See \code{\link{match_number}} for details. + +\bold{Stage 3} (qAddress = 3). No exact match on street name could be found. +Street names are fuzzy matched. The method "jw" (Jaro-Winkler distance) from +package stringdist is used (see stringdist-metrics). If 1 - [Jaro-Winkler distance] +is greater than fuzzy_threshold, a match is assumed. The highest score is +taken and house number matching is done as outlined in Stage 2. +qscore is fuzzy_score*[house number score]. + +\bold{Stage 4} (qAddress = 4). No match (qscore = 0) +} +\value{ +A list + \item{ret}{The merged dataset} + \item{QA}{The quality markers (qAddress and qscore)} +} +\author{Daniel Schürmann} +\seealso{\code{\link{split_address}}, \code{\link{split_number}}}
\ No newline at end of file diff --git a/KOR.addrlink/man/df1.Rd b/KOR.addrlink/man/df1.Rd new file mode 100644 index 0000000..cc56c68 --- /dev/null +++ b/KOR.addrlink/man/df1.Rd @@ -0,0 +1,18 @@ +\name{df1} +\docType{data} +\alias{df1} +\title{Example dataset 1} +\description{ + This dataset contains separate street and house number information. +} +\usage{df1} +\format{A data.frame + \tabular{lll}{ + gross_strasse \tab character \tab street names\cr + hausnr \tab character \tab house number and additional letter\cr + Var1 \tab numeric \tab Variable 1\cr + Var2 \tab character \tab Variable 2 + } +} +\source{Dortmunder Statistik} +\keyword{datasets}
\ No newline at end of file diff --git a/KOR.addrlink/man/df2.Rd b/KOR.addrlink/man/df2.Rd new file mode 100644 index 0000000..0937b62 --- /dev/null +++ b/KOR.addrlink/man/df2.Rd @@ -0,0 +1,17 @@ +\name{df2} +\docType{data} +\alias{df2} +\title{Example dataset 2} +\description{ + This dataset contains concatenated street and house number information. +} +\usage{df2} +\format{A data.frame + \tabular{lll}{ + Adresse \tab character \tab street name, house number and additional letter\cr + Var1 \tab numeric \tab Variable 1\cr + Var2 \tab character \tab Variable 2 + } +} +\source{Dortmunder Statistik} +\keyword{datasets}
\ No newline at end of file diff --git a/KOR.addrlink/man/helper_split_address.Rd b/KOR.addrlink/man/helper_split_address.Rd new file mode 100644 index 0000000..bc87965 --- /dev/null +++ b/KOR.addrlink/man/helper_split_address.Rd @@ -0,0 +1,18 @@ +\name{helper_split_address} +\alias{helper_split_address} +\title{Splits A Single Address Into Street, House Number And Additional Letter} +\description{This is an internal function. Please use \code{\link{split_address}}} +\usage{helper_split_address(x, debug = FALSE) +} +\arguments{ + \item{x}{A character vector of length 1} + \item{debug}{If true, print(x)} +} +\value{ +A list with three elements + \item{strasse}{Extracted street name} + \item{hnr}{Extracted house number} + \item{hnrz}{Extracted extra letter} +} +\author{Daniel Schürmann} +\seealso{\code{\link{split_address}}} diff --git a/KOR.addrlink/man/helper_split_number.Rd b/KOR.addrlink/man/helper_split_number.Rd new file mode 100644 index 0000000..570504c --- /dev/null +++ b/KOR.addrlink/man/helper_split_number.Rd @@ -0,0 +1,18 @@ +\name{helper_split_number} +\alias{helper_split_number} +\title{Splits A Single House Number Into House Number And Additional Letter} +\description{This is an internal function. Please use \code{\link{split_number}}} +\usage{helper_split_number(x, debug = FALSE) +} +\arguments{ + \item{x}{A character vector of length 1} + \item{debug}{If true, print(x)} +} +\value{ +A data.frame with two elements + \item{Hausnummer}{Extracted house number} + \item{Zusatz}{Extracted extra letter} +} +\author{Daniel Schürmann} +\seealso{\code{\link{split_number}}} + diff --git a/KOR.addrlink/man/l1score.Rd b/KOR.addrlink/man/l1score.Rd new file mode 100644 index 0000000..a60a7f3 --- /dev/null +++ b/KOR.addrlink/man/l1score.Rd @@ -0,0 +1,18 @@ +\name{l1score} +\alias{l1score} +\title{Calculate L1-Distance Based Scores} +\description{ +Reversed normalized absolute distance from zero. 
+} +\usage{l1score(x)} +\arguments{ + \item{x}{A numeric vector} +} +\details{ +\deqn{1 - \frac{|x|}{\text{max}\{1, |x|\}}}{1 - |x| / max(1, |x|)} +} +\value{ +A numeric vector of the same length as x +} +\author{Daniel Schürmann} + diff --git a/KOR.addrlink/man/match_number.Rd b/KOR.addrlink/man/match_number.Rd new file mode 100644 index 0000000..b25e48c --- /dev/null +++ b/KOR.addrlink/man/match_number.Rd @@ -0,0 +1,39 @@ +\name{match_number} +\alias{match_number} +\title{Find Best House Number Match Within Given Street} +\description{This is an internal function. Please use \code{\link{addrlink}}} +\usage{match_number(record, Adressen, weights = c(0.9, 0.1))} + +\arguments{ + \item{record}{data.frame with one row and three columns (Strasse, Hausnummer, Hausnummernzusatz)} + \item{Adressen}{data.frame of all valid addresses (same columns as record data.frame)} + \item{weights}{The weighting factors between house number and additional letter} +} + +\details{ +If no house number and no additional letter is provided, a random address in +the given street is selected (qscore = 0). + +If only an additional letter but no house number is given and the letter is unique, +returns the corresponding record (qscore = 0.05). Otherwise returns a random one +as mentioned above (qscore = 0). + +If no additional letter, but house number is provided and the distance to +the closest valid house number is at most 4, return the closest match as calculated by +\code{\link{l1score}} (qscore is the result of l1score). Otherwise a random record +is returned (qscore = 0). + +If additional letter and house number are available and the house number distance +is at most 4, calculates the l1scores of the house number distance and +additional letters distance and selects the best match (qscore is the sum of both +weighted l1scores). Otherwise a random record is selected (qscore = 0). 
+} +\value{ +A data.frame + \item{qscore}{The quality score of the match} + \item{Strasse}{matched street} + \item{Hausnummer}{matched house number} + \item{Hausnummernzusatz}{matched additional letter} +} +\author{Daniel Schürmann} +\seealso{\code{\link{addrlink}}}
\ No newline at end of file diff --git a/KOR.addrlink/man/sanitize_street.Rd b/KOR.addrlink/man/sanitize_street.Rd new file mode 100644 index 0000000..cce0ce5 --- /dev/null +++ b/KOR.addrlink/man/sanitize_street.Rd @@ -0,0 +1,28 @@ +\name{sanitize_street} +\alias{sanitize_street} +\title{Clean Street Names And Make Them Mergeable} +\description{ +This function replaces Umlauts, expands "str" to "strasse", +transliterates all non-ascii characters, removes punctuation and converts +to lower case. +} +\usage{sanitize_street(x)} + +\arguments{ + \item{x}{A character vector containing the street names} +} +\details{ +This is an internal function used in \code{addrlink}. +Make sure house numbers have already been extracted. +Use \code{split_number} or \code{split_address} for that. +Only street names can go into \code{sanitize_street}. +} +\value{ +A character vector of the same length as x containing the +sanitized street names. } +\author{Daniel Schürmann} + +\seealso{ +\code{\link{split_address}}, \code{\link{split_number}}, \code{\link{addrlink}} +} + diff --git a/KOR.addrlink/man/split_address.Rd b/KOR.addrlink/man/split_address.Rd new file mode 100644 index 0000000..91e801d --- /dev/null +++ b/KOR.addrlink/man/split_address.Rd @@ -0,0 +1,28 @@ +\name{split_address} +\alias{split_address} +\title{Split Adresses Into Street, House Number And Additional Letter} +\description{ +This function takes a character vector where each element is made up from a concatenation of +street name, house number and possibly an additional letter and splits it into its parts. +} +\usage{split_address(x, debug = FALSE)} +\arguments{ + \item{x}{A character vector} + \item{debug}{If true, all records will be printed to the console} +} +\details{ +If the function fails, consider using \code{debug = TRUE}. This will print the record, which caused the error. +Consider filing an issue on the linked git project (see DESCRIPTION). 
+} +\value{ +A data.frame with three columns + \item{Strasse}{A character column containing the extracted street names} + \item{Hausnummer}{House number} + \item{Hausnummernzusatz}{Additional letter} +} +\author{Daniel Schürmann} +\note{For a more advanced, general purpose solution see libpostal.} +\seealso{\code{\link{split_number}}} +\examples{ +split_address(c("Teststr. 8-9 a", "Erster Weg 1-2", "Ahornallee 100a-102c")) +} diff --git a/KOR.addrlink/man/split_number.Rd b/KOR.addrlink/man/split_number.Rd new file mode 100644 index 0000000..c8bf7a5 --- /dev/null +++ b/KOR.addrlink/man/split_number.Rd @@ -0,0 +1,27 @@ +\name{split_number} +\alias{split_number} +\title{Split house number into house number and additional letter} +\description{ +This function takes a character vector where each element is made up from a concatenation of +house number and possibly an additional letter and splits it into its parts. +} +\usage{split_number(x, debug = FALSE)} +\arguments{ + \item{x}{A character vector} + \item{debug}{If true, all records will be printed to the console} +} +\details{ +If the function fails, consider using \code{debug = TRUE}. This will print the record, which caused the error. +Consider filing an issue on the linked git project (see DESCRIPTION). 
+} +\value{ +A data.frame with two columns + \item{Hausnummer}{House number} + \item{Hausnummernzusatz}{Additional letter} +} +\author{Daniel Schürmann} +\note{For a more advanced, general purpose solution see libpostal.} +\seealso{\code{\link{split_address}}} +\examples{ +split_number(c("8-9 a", "1-2", "100a-102c")) +} diff --git a/KOR.addrlink/tests/Examples/KOR.addrlink-Ex.Rout.save b/KOR.addrlink/tests/Examples/KOR.addrlink-Ex.Rout.save new file mode 100644 index 0000000..b6c22d2 --- /dev/null +++ b/KOR.addrlink/tests/Examples/KOR.addrlink-Ex.Rout.save @@ -0,0 +1,81 @@ + +R version 4.3.3 (2024-02-29) -- "Angel Food Cake" +Copyright (C) 2024 The R Foundation for Statistical Computing +Platform: x86_64-pc-linux-gnu (64-bit) + +R is free software and comes with ABSOLUTELY NO WARRANTY. +You are welcome to redistribute it under certain conditions. +Type 'license()' or 'licence()' for distribution details. + + Natural language support but running in an English locale + +R is a collaborative project with many contributors. +Type 'contributors()' for more information and +'citation()' on how to cite R or R packages in publications. + +Type 'demo()' for some demos, 'help()' for on-line help, or +'help.start()' for an HTML browser interface to help. +Type 'q()' to quit R. + +> pkgname <- "KOR.addrlink" +> source(file.path(R.home("share"), "R", "examples-header.R")) +> options(warn = 1) +> library('KOR.addrlink') +> +> base::assign(".oldSearch", base::search(), pos = 'CheckExEnv') +> base::assign(".old_wd", base::getwd(), pos = 'CheckExEnv') +> cleanEx() +> nameEx("split_address") +> ### * split_address +> +> flush(stderr()); flush(stdout()) +> +> ### Name: split_address +> ### Title: Split Adresses Into Street, House Number And Additional Letter +> ### Aliases: split_address +> +> ### ** Examples +> +> split_address(c("Teststr. 8-9 a", "Erster Weg 1-2", "Ahornallee 100a-102c")) + Strasse Hausnummer Hausnummernzusatz +1 Teststr. 
8 A +2 Erster Weg 1 <NA> +3 Ahornallee 102 C +> +> +> +> cleanEx() +> nameEx("split_number") +> ### * split_number +> +> flush(stderr()); flush(stdout()) +> +> ### Name: split_number +> ### Title: Split house number into house number and additional letter +> ### Aliases: split_number +> +> ### ** Examples +> +> split_number(c("8-9 a", "1-2", "100a-102c")) + Hausnummer Hausnummernzusatz +1 8 <NA> +2 1 <NA> +3 100 a +> +> +> +> ### * <FOOTER> +> ### +> cleanEx() +> options(digits = 7L) +> base::cat("Time elapsed: ", proc.time() - base::get("ptime", pos = 'CheckExEnv'),"\n") +Time elapsed: 0.155 0.019 0.224 0.009 0.006 +> grDevices::dev.off() +null device + 1 +> ### +> ### Local variables: *** +> ### mode: outline-minor *** +> ### outline-regexp: "\\(> \\)?### [*]+" *** +> ### End: *** +> quit('no') diff --git a/KOR.addrlink/tests/test_l1score.R b/KOR.addrlink/tests/test_l1score.R new file mode 100644 index 0000000..6708f80 --- /dev/null +++ b/KOR.addrlink/tests/test_l1score.R @@ -0,0 +1 @@ +KOR.addrlink:::l1score(c(-4, -1.3, 0, NA, 5.1, 10)) diff --git a/KOR.addrlink/tests/test_l1score.Rout.save b/KOR.addrlink/tests/test_l1score.Rout.save new file mode 100644 index 0000000..6e952b2 --- /dev/null +++ b/KOR.addrlink/tests/test_l1score.Rout.save @@ -0,0 +1,23 @@ + +R version 4.3.3 (2024-02-29) -- "Angel Food Cake" +Copyright (C) 2024 The R Foundation for Statistical Computing +Platform: x86_64-pc-linux-gnu (64-bit) + +R is free software and comes with ABSOLUTELY NO WARRANTY. +You are welcome to redistribute it under certain conditions. +Type 'license()' or 'licence()' for distribution details. + +R is a collaborative project with many contributors. +Type 'contributors()' for more information and +'citation()' on how to cite R or R packages in publications. + +Type 'demo()' for some demos, 'help()' for on-line help, or +'help.start()' for an HTML browser interface to help. +Type 'q()' to quit R. 
+ +> KOR.addrlink:::l1score(c(-4, -1.3, 0, NA, 5.1, 10)) +[1] 0.60 0.87 1.00 NA 0.49 0.00 +> +> proc.time() + user system elapsed + 0.242 0.062 0.266 diff --git a/KOR.addrlink/tests/test_sanitize_street.R b/KOR.addrlink/tests/test_sanitize_street.R new file mode 100644 index 0000000..3543a6b --- /dev/null +++ b/KOR.addrlink/tests/test_sanitize_street.R @@ -0,0 +1 @@ +KOR.addrlink:::sanitize_street(c("Binde-Strich-Strasse", "Teststr.", "A.-B.-C.-Str.")) diff --git a/KOR.addrlink/tests/test_sanitize_street.Rout.save b/KOR.addrlink/tests/test_sanitize_street.Rout.save new file mode 100644 index 0000000..3bb9e3f --- /dev/null +++ b/KOR.addrlink/tests/test_sanitize_street.Rout.save @@ -0,0 +1,23 @@ + +R version 4.3.3 (2024-02-29) -- "Angel Food Cake" +Copyright (C) 2024 The R Foundation for Statistical Computing +Platform: x86_64-pc-linux-gnu (64-bit) + +R is free software and comes with ABSOLUTELY NO WARRANTY. +You are welcome to redistribute it under certain conditions. +Type 'license()' or 'licence()' for distribution details. + +R is a collaborative project with many contributors. +Type 'contributors()' for more information and +'citation()' on how to cite R or R packages in publications. + +Type 'demo()' for some demos, 'help()' for on-line help, or +'help.start()' for an HTML browser interface to help. +Type 'q()' to quit R. + +> KOR.addrlink:::sanitize_street(c("Binde-Strich-Strasse", "Teststr.", "A.-B.-C.-Str.")) +[1] "bindestrichstrasse" "teststrasse" "abcstrasse" +> +> proc.time() + user system elapsed + 0.259 0.059 0.289 diff --git a/KOR.addrlink/tests/test_split_address.R b/KOR.addrlink/tests/test_split_address.R new file mode 100644 index 0000000..7b7db48 --- /dev/null +++ b/KOR.addrlink/tests/test_split_address.R @@ -0,0 +1,2 @@ +KOR.addrlink::split_address(c("Teststr. 8-9 a", "Erster Weg 1-2", "Ahornallee 100a-102c", +"Stra\u00dfe des 1. 
Mai 10/12", "Emmerich-K\u00e1lm\u00e1n-Stra\u00dfe 15")) diff --git a/KOR.addrlink/tests/test_split_address.Rout.save b/KOR.addrlink/tests/test_split_address.Rout.save new file mode 100644 index 0000000..65ab67b --- /dev/null +++ b/KOR.addrlink/tests/test_split_address.Rout.save @@ -0,0 +1,29 @@ + +R version 4.3.3 (2024-02-29) -- "Angel Food Cake" +Copyright (C) 2024 The R Foundation for Statistical Computing +Platform: x86_64-pc-linux-gnu (64-bit) + +R is free software and comes with ABSOLUTELY NO WARRANTY. +You are welcome to redistribute it under certain conditions. +Type 'license()' or 'licence()' for distribution details. + +R is a collaborative project with many contributors. +Type 'contributors()' for more information and +'citation()' on how to cite R or R packages in publications. + +Type 'demo()' for some demos, 'help()' for on-line help, or +'help.start()' for an HTML browser interface to help. +Type 'q()' to quit R. + +> KOR.addrlink::split_address(c("Teststr. 8-9 a", "Erster Weg 1-2", "Ahornallee 100a-102c", ++ "Stra\u00dfe des 1. Mai 10/12", "Emmerich-K\u00e1lm\u00e1n-Stra\u00dfe 15")) + Strasse Hausnummer Hausnummernzusatz +1 Teststr. 8 A +2 Erster Weg 1 <NA> +3 Ahornallee 102 C +4 Straße des 1. 
Mai 10 <NA> +5 Emmerich-Kálmán-Straße 15 <NA> +> +> proc.time() + user system elapsed + 0.277 0.053 0.299 diff --git a/KOR.addrlink/tests/test_split_number.R b/KOR.addrlink/tests/test_split_number.R new file mode 100644 index 0000000..81d01eb --- /dev/null +++ b/KOR.addrlink/tests/test_split_number.R @@ -0,0 +1 @@ +KOR.addrlink::split_number(c("8-9 a", "1-2", "100a-102c", "2/3", "25/27a", "3 5")) diff --git a/KOR.addrlink/tests/test_split_number.Rout.save b/KOR.addrlink/tests/test_split_number.Rout.save new file mode 100644 index 0000000..766e419 --- /dev/null +++ b/KOR.addrlink/tests/test_split_number.Rout.save @@ -0,0 +1,29 @@ + +R version 4.3.3 (2024-02-29) -- "Angel Food Cake" +Copyright (C) 2024 The R Foundation for Statistical Computing +Platform: x86_64-pc-linux-gnu (64-bit) + +R is free software and comes with ABSOLUTELY NO WARRANTY. +You are welcome to redistribute it under certain conditions. +Type 'license()' or 'licence()' for distribution details. + +R is a collaborative project with many contributors. +Type 'contributors()' for more information and +'citation()' on how to cite R or R packages in publications. + +Type 'demo()' for some demos, 'help()' for on-line help, or +'help.start()' for an HTML browser interface to help. +Type 'q()' to quit R. 
+ +> KOR.addrlink::split_number(c("8-9 a", "1-2", "100a-102c", "2/3", "25/27a", "3 5")) + Hausnummer Hausnummernzusatz +1 8 <NA> +2 1 <NA> +3 100 a +4 2 <NA> +5 25 <NA> +6 3 <NA> +> +> proc.time() + user system elapsed + 0.246 0.054 0.274 diff --git a/KOR.addrlink/vignettes/Example.Rnw b/KOR.addrlink/vignettes/Example.Rnw new file mode 100644 index 0000000..aef9c2e --- /dev/null +++ b/KOR.addrlink/vignettes/Example.Rnw @@ -0,0 +1,143 @@ +\documentclass{article} +%\VignetteIndexEntry{Example} +\usepackage[utf8]{inputenc} +\begin{document} +\SweaveOpts{concordance=TRUE} +\title{Using KOR.addrlink} +\author{Daniel Sch\"urmann} +\date{February 29, 2024} +\maketitle + +\section{Introduction} + +Consider a data set with semi-structured address data, e.g. street and house number as a concatenated string, +wrongly spelled street names or non-existing house numbers. This data set (referred to as df\_match) should be +mapped to a complete list of valid addresses within the given municipality. The latter data set is +called df\_ref and may include further information like coordinates of addresses or district information. +KOR.addrlink tries to solve this problem specifically for German municipalities as the package is specialized +in German address schemes. + +\section{Reference data} + +First, a complete list of reference addresses (df\_ref) is needed. An example +data.frame named "Adressen" is shown below. + +<<>>= +library(KOR.addrlink) +Adressen[c(sample(which(is.na(Adressen$HNRZ)), 4), + sample(which(!is.na(Adressen$HNRZ)), 2)),] +@ + +The columns used for the matching procedure are STRNAME (street name), HNR (house number) +and HNRZ (additional letter). This vignette illustrates the merging workflow on two sample data sets called df1 and df2. + +\section{Example 1} +df1 has address information in columns gross\_strasse and hausnr. +The columns Var1 and Var2 provide non-address related information about +the individuals. 
Row 1183 shows that the column hausnr needs to be split +into house number and additional letter before addresses can be matched. +The function split\_number is provided for that task. + +<<>>= +df1[1180:(1183+6),] +@ + +split\_number takes hausnr and creates a data.frame with columns "Hausnummer" +(house number) and "Hausnummernzusatz" (additional letter). + +<<>>= +df1 <- cbind(df1, split_number(df1$hausnr)) +df1[1180:(1183+6),] +@ + +addrlink merges the two data sets. For both data sets, the columns referring +to street name, house number and additional letter need to be specified +in exactly that order (parameters col\_ref and col\_match). + +<<>>= +# column hausnr is no longer needed +df1 <- within(df1, rm(hausnr)) +df1_matched <- addrlink(df_ref = Adressen, + col_ref = c("STRNAME", "HNR", "HNRZ"), + df_match = df1, + col_match = c("gross_strasse", "Hausnummer", "Hausnummernzusatz")) +@ + +The result is a list with two data.frames +\begin{itemize} +\item ret: The merged data set +\item QA: Indicators showing the match quality +\end{itemize} + +<<>>= +head(df1_matched$ret) +table(df1_matched$QA$qAddress) +@ + +qAddress states the stage within the matching procedure that yielded the match. +Out of the 10000 records, 9670 could be merged directly. 72 had a valid street +name, but an invalid house number. 157 records had (possibly) misspelled street +names and 101 records could not be matched at all. + +\section{Example 2} + +The second data set has a single column "Adresse", which includes street names +and house numbers. Thus, this column needs to be split by the function +split\_address. + +<<>>= +head(within(df2, Adresse <- trimws(Adresse))) +@ + +split\_address creates a data.frame with columns "Strasse" (street), "Hausnummer" +(house number) and "Hausnummernzusatz" (additional letter) from the column +"Adresse". 
+ +<<>>= +df2 <- cbind(df2, split_address(df2$Adresse)) +within(df2, Adresse <- trimws(Adresse))[23:(23+6),] +@ + +Again, addrlink merges the two data sets. The parameter fuzzy\_threshold +sets the threshold for fuzzy matching of misspelled street names. A value +of 1 means no fuzzy matching and 0 means forced fuzzy matches for all records. +If a street name could be matched, but the provided house number does not exist, addrlink +may randomly assign a valid house number to that record. A seed is always set +to ensure reproducibility. Customization is possible via the parameter seed. + +<<>>= +# column Adresse is no longer needed +df2 <- within(df2, rm(Adresse)) +df2_matched <- addrlink(df_ref = Adressen, + col_ref = c("STRNAME", "HNR", "HNRZ"), + df_match = df2, + col_match = c("Strasse", "Hausnummer", "Hausnummernzusatz"), + fuzzy_threshold = .9, seed = 1234) +@ + +<<>>= +head(df2_matched$ret) +table(df2_matched$QA$qAddress) +@ + +49 records had invalid house numbers and one record was matched by +fuzzy matching. This record can be inspected in detail. + +<<>>= +id <- which(df2_matched$QA$qAddress == 3) +df2_matched$ret[id,] +df2_matched$QA[id,] +@ + +In this case the fuzzy matching procedure was most likely correct +(St.-Georg-Str. matched SANKT-GEORG-STRA{\ss}E). + +The number of cases with correct street name and randomly assigned house +numbers is 10. + +<<>>= +sum(df2_matched$QA$qscore == 0) +@ + + +\end{document} diff --git a/README.html b/README.html new file mode 100644 index 0000000..1494e7f --- /dev/null +++ b/README.html @@ -0,0 +1,18 @@ +<h1 id="kor.addrlink">KOR.addrlink</h1> +<h3 id="beschreibung">Beschreibung</h3> +<p>Die Ausgangssituation ist eine Datenlieferung mit Adressdaten als +Textfeld. Diese sollen anhand einer vorhandenen Adressdatenbank +geokodiert werden (z.B. 
den UBZ zugeordnet werden)</p> +<h2 id="installation">Installation</h2> +<p><code>install.packages("KOR.addrlink")</code></p> +<h2 id="benutzung">Benutzung</h2> +<p>Am besten in die Vignette gucken. Da sind zwei verschiedene Beispiele +drin.</p> +<h2 id="hilfe">Hilfe</h2> +<p>d.schuermann@2718282.net</p> +<h2 id="beitragen">Beitragen</h2> +<p>Issues und pull requests sind gerne gesehen!</p> +<h2 id="autor">Autor</h2> +<p>Daniel Schürmann</p> +<h2 id="lizenz">Lizenz</h2> +<p>GPL-3</p>
\ No newline at end of file |