두 개의 데이터 세트 A와 B가 각각 8 개의 coloumns를 갖습니다. 데이터 세트 A에는 942 개의 행이 있고 데이터 세트 B에는 5079 개의 행이 있습니다. 데이터 집합 A와 데이터 집합 B를 비교하고 퍼지 매칭을 수행해야합니다. 데이터 세트 B에서 일치하는 행이 있으면 데이터 세트 A에서 "일치"를 추가 열에 표시해야합니다.for-loop를 사용하지 않고 두 데이터 세트의 행을 유사하게 매칭합니다.
저는 R에 비교적 익숙하지 않고 forloop 대신 lapply, mapply 또는 sapply로 r 코드를 최적화하는 방법을 모릅니다. 다음
내 코드 각각
##############################
# Install Necessary Packages #
##############################
#install.packages("openxlsx")
#install.packages("stringdist")
#install.packages("XLConnect")
##############################
# Load Packages #
##############################
library(openxlsx)
library(stringdist)
library(XLConnect)
cmd_newleads <- read.xlsx("Src/CMD - New Leads to Load.xlsx", sheet = "Top Leads Full Data", startRow = 1, colNames = TRUE)
cmd_newleads[c("Lead_Match","Opportunity_Match")] <- ""
c4c_leads <- read.xlsx("Src/C4C - Leads.xlsx", sheet = "Leads", startRow = 1, colNames = TRUE)
#c4c_opportunities <- read.xlsx("Src/C4C - Opportunities Data 6-24-16.xlsx", sheet = "Export 06-24-2016 04.55.46 PM", startRow = 1, colNames = TRUE)
cmd_newleads_selcols <- cmd_newleads[,c("project_name","project_address","project_city","project_state_province_region_code","project_postalcode","project_country","project_sector","project_type")]
cmd_newleads_selcols[is.na(cmd_newleads_selcols)] <- ""
#rownames(cmd_newleads_selcols)
c4cleads_selcols <- c4c_leads[,c("Lead","Address1.(Lead)","City.(Lead)","Region.(Lead)","Postal.Code.(Lead)","Country.(Lead)","Sector.(Lead)","Type.(Lead)")]
c4cleads_selcols[is.na(c4cleads_selcols)] <- ""
#cmd_c4copportunities_selcols <- c4c_opportunities[,c("project_name","project_address","project_city","project_state_province_region_code","project_postalcode","project_country","project_sector","project_type")]
rcount_cmdnewleads <- nrow(cmd_newleads)
rcount_c4cleads <- nrow(c4c_leads)
#rcount_c4copportunities <- nrow(c4c_opportunities)
for(i in 1:rcount_cmdnewleads)
{
cmd_project_name <- cmd_newleads_selcols[i,1]
cmd_project_address <- cmd_newleads_selcols[i,2]
cmd_project_city <- cmd_newleads_selcols[i,3]
cmd_project_region_code <- cmd_newleads_selcols[i,4]
cmd_project_postalcode <- cmd_newleads_selcols[i,5]
cmd_project_country <- cmd_newleads_selcols[i,6]
cmd_project_sector <- cmd_newleads_selcols[i,7]
cmd_project_type <- cmd_newleads_selcols[i,8]
for(j in 1:rcount_c4cleads)
{
c4cleads_project_name <- c4cleads_selcols[j,1]
c4cleads_project_address <- c4cleads_selcols[j,2]
c4cleads_project_city <- c4cleads_selcols[j,3]
c4cleads_project_region_code <- c4cleads_selcols[j,4]
c4cleads_project_postalcode <- c4cleads_selcols[j,5]
c4cleads_project_country <- c4cleads_selcols[j,6]
c4cleads_project_sector <- c4cleads_selcols[j,7]
c4cleads_project_type <- c4cleads_selcols[j,8]
project_percent <- stringsim(cmd_project_name,c4cleads_project_name, method="dl", p=0.1)
address_percent <- stringsim(cmd_project_address,c4cleads_project_address, method="dl", p=0.1)
city_percent <- stringsim(cmd_project_city,c4cleads_project_city, method="dl", p=0.1)
region_percent <- stringsim(cmd_project_region_code,c4cleads_project_region_code, method="dl", p=0.1)
postalcode_percent <- stringsim(cmd_project_postalcode,c4cleads_project_postalcode, method="dl", p=0.1)
country_percent <- stringsim(cmd_project_country,c4cleads_project_country, method="dl", p=0.1)
sector_percent <- stringsim(cmd_project_sector,c4cleads_project_sector, method="dl", p=0.1)
type_percent <- stringsim(cmd_project_type,c4cleads_project_type, method="dl", p=0.1)
if(project_percent > 0.833 && address_percent > 0.833 && city_percent > 0.833 && region_percent > 0.833 && postalcode_percent > 0.833 && country_percent > 0.833 && sector_percent > 0.833 && type_percent > 0.833)
{
cmd_newleads[i,51] <- c4cleads[j,c4cleads$Lead.ID]
}
else
{
cmd_newleads[i,51] <- "New Lead"
}
}
}
샘플 cmd_newleads_selcols에 대한 데이터와 c4cleads_selcols
project_name project_address project_city
1 Wynn Mystic Casino & Hotel 22 Chemical Ln Everett
2 Northpoint Complex Development East Street Cambridge
3 Northpoint Complex Development East Street Cambridge
4 Northpoint Complex Development East Street Cambridge
5 Northpoint Complex Development East Street Cambridge
6 Northpoint Complex Development East Street Cambridge
project_state_province_region_code project_postalcode
1 MA 02149
2 MA 02138
3 MA 02138
4 MA 02138
5 MA 02138
6 MA 02138
project_country project_sector project_type
1 United States of America Hospitality New Building
2 United States of America Apartments New Building
3 United States of America Apartments New Building
4 United States of America Apartments New Building
5 United States of America Apartments New Building
6 United States of America Apartments New Building
Lead Address1.(Lead) City.(Lead) Region.(Lead) Postal.Code.(Lead) Country.(Lead)
1 1 Hotel Brooklyn Bridge Park Old Fulton St & Furman St Brooklyn New York 11201 United States
2 10 Trinity Square Hotel 10 Trinity Square London # EC3P United Kingdom
3 100 Stewart 1900 1st Avenue Seattle Washington 98101 United States
4 1136 S Wabash # # # # Not assigned
5 115-129 37th Street 115-129 37th Street Union CIty New Jersey # United States
6 1418 W Addison 1418 w Addison Chicago # 60613 Not assigned
Sector.(Lead) Type.(Lead)
1 Hospitality New Building
2 Hospitality Brand Conversion
3 Hospitality New Building
4 High Rise Residential New Building
5 Developer New Building
6 High Rise Residential New Building
예, stringdist를 사용하고 있습니다. – Naveen
너무 많은 시간을 들여서 forloop을 피해야합니다. – Naveen
전체 코드가 추가되었습니다. 이해합니다. 샘플 데이터를 얻으려고합니다. – Naveen