2016-06-09 13 views
1

누구든지이 코드를 수정하도록 도와 줄 수 있습니까? 작년에 문제없이 스크립트를 사용했지만 지금은 URL에 연결하는 데 문제가 있습니다.R 기능을 사용하여 Wunderground에서 과거 기상 데이터를 어떻게 수집합니까?

어떻게 수정합니까?

내가 원하는 것은 기상 관측소 "EKAH"(Tirstrup, Aarhus Airport, Denmark)에서 2015-12-01부터 2016-04-15까지의 데이터를 수집하고 정렬하는 것입니다. 난 그냥 새로운 하나에 대한 나쁜 URL을 자막 처리 수 있었다 생각하면서

############## 1) Run function -------------------- 

    wunder_station_daily <- function(station, date) 
    { 
     base_url <- 'https://www.wunderground.com/history/airport' 

     # Example website: https://www.wunderground.com/history/airport/EKAH/2016/06/09/DailyHistory.html?&MR=1 

     # parse date 
     m <- as.integer(format(date, '%m')) 
     d <- as.integer(format(date, '%d')) 
     y <- format(date, '%Y') 

     # compose final url 
     final_url <- paste(base_url, 
         '/', station, 
         '/', y, 
         '/', m, 
         '/', d, 
         '/DailyHistory.html?&MR=1', sep='') 

     # reading in as raw lines from the web server 
     # contains <br> tags on every other line 

     # u <- url(final_url) 
     # the_data <- readLines(u) 
     # close(u) 

     the_data <- getURL(final_url, ssl.verifypeer=0L, followlocation=1L) 

     # only keep records with more than 5 rows of data 
     if(length(the_data) > 5) 
     { 
     # remove the first and last lines 
     the_data <- the_data[-c(1, length(the_data))] 

     # remove odd numbers starting from 3 --> end 
     the_data <- the_data[-seq(3, length(the_data), by=2)] 

     # extract header and cleanup 
     the_header <- the_data[1] 
     the_header <- make.names(strsplit(the_header, ',')[[1]]) 

     # convert to CSV, without header 
     tC <- textConnection(paste(the_data, collapse='\n')) 
     the_data <- read.csv(tC, as.is=TRUE, row.names=NULL, header=FALSE, skip=1) 
     close(tC) 

     # remove the last column, created by trailing comma 
     the_data <- the_data[, -ncol(the_data)] 

     # assign column names 
     names(the_data) <- the_header 

     # convert Time column into properly encoded date time 
     the_data$Time <- as.POSIXct(strptime(the_data$Time, format='%Y-%m-%d %H:%M:%S')) 

     # remove UTC and software type columns 
     the_data$DateUTC.br. <- NULL 
     the_data$SoftwareType <- NULL 

     # sort and fix rownames 
     the_data <- the_data[order(the_data$Time), ] 
     row.names(the_data) <- 1:nrow(the_data) 

     # done 
     return(the_data) 
     } 
    } 


    ############## 2) Get data for a range of dates ------------------------------ 


    date.range <- seq.Date(from=as.Date('2015-12-01'), to=as.Date('2015-12-04'), by='1 day') 
    station <- 'EKAH' 


    # pre-allocate list 
    l <- vector(mode='list', length=length(date.range)) 

    # loop over dates, and fetch data 
    for(i in seq_along(date.range)) 
    { 
     print(paste0("Fetching data: ", date.range[i])) 
     l[[i]] <- wunder_station_daily('EKAH', date.range[i]) 
    } 

    # stack elements of list into DF, filling missing columns with NA 
    d <- ldply(l) 

답변

1

그들은 여기에 대부분의 코드의 현대화 버전입니다, 해당 URL을 조금 변경하고 (내가 정리의 99 %를 했나요) 한 : 원하는 경우가 na.strings 벡터에 ""을 추가 할 수 있습니다

#' @param station station name 
#' @param wx_date Date object or character string 
#' @param fmt if wx_date is not a Date object and the character string 
#'  is not in "%Y-%m-%d" format, then specify the format here 
#' @return data.frame of redings 
get_wx <- function(station="EKAH", wx_date=Sys.Date(), fmt="%Y-%m-%d") { 

    require(httr) 
    require(readr) 

    if (inherits(wx_date, "character")) { 
    wx_date <- as.Date(wx_date, fmt) 
    } 

    wx_base_url <- "https://www.wunderground.com/history/airport/%s/%s/DailyHistory.html" 
    wx_url <- sprintf(wx_base_url, station, format(wx_date, "%Y/%m/%d")) 

    res <- httr::GET(wx_url, query=list(MR=1, format=1)) 
    dat <- httr::content(res, as="text") 

    dat <- gsub("<br />", "", dat) 
    dat <- read.table(text=dat, sep=",", header=TRUE, 
        na.strings=c("-", "N/A", "NA"), stringsAsFactors=FALSE) 

    # saner column names 

    cols <- colnames(dat) 

    # via http://stackoverflow.com/a/22528880/1457051 
    cols <- gsub("([a-z])([A-Z])", "\\1_\\L\\2", cols, perl=TRUE) 
    cols <- sub("^(_[a-z])", "\\L\\1", cols, perl=TRUE) 
    cols <- tolower(gsub("\\.", "_", cols)) 

    readr::type_convert(setNames(dat, cols)) # more robust than type.convert() 

} 

tdy <- get_wx() 

str(tdy) 
## 'data.frame': 36 obs. of 14 variables: 
## $ time_cest   : chr "12:00 AM" "12:20 AM" "12:50 AM" "1:00 AM" ... 
## $ temperature_f  : num 51 50 48.2 47 46.4 44.6 44 44.6 44.6 44 ... 
## $ dew_point_f   : num 41 41 39.2 39 39.2 39.2 38 39.2 39.2 38 ... 
## $ humidity    : int 60 71 71 67 76 81 71 81 81 73 ... 
## $ sea_level_pressure_in: num 30.1 30.1 30.1 30.1 30.1 ... 
## $ visibility_mph  : num 28 6.2 6.2 28 6.2 6.2 7 6.2 6.2 28 ... 
## $ wind_direction  : chr "WNW" "West" "West" "West" ... 
## $ wind_speed_mph  : chr "2.3" "2.3" "2.3" "2.3" ... 
## $ gust_speed_mph  : logi NA NA NA NA NA NA ... 
## $ precipitation_in  : logi NA NA NA NA NA NA ... 
## $ events    : logi NA NA NA NA NA NA ... 
## $ conditions   : chr NA "Unknown" "Unknown" NA ... 
## $ wind_dir_degrees  : int 300 270 270 270 270 270 270 280 280 270 ... 
## $ date_utc    : POSIXct, format: "2016-06-08 22:00:00" "2016-06-08 22:20:00" ... 

a_yr_ago <- get_wx(wx_date="2015-06-09") 

.

library(purrr) 

rng <- map_df(seq(as.Date("2015-12-01"), as.Date("2015-12-04"), "1 day"), 
       function(x) { get_wx(wx_date=x) }) 
:

는 그리고 여기에 data.frame에 날짜 범위에 대한 측정 값을 얻을 수있는 다른 방법