בחנתי נתוני איחורים לפי חיתוכים של תחנות וזמנים שונים (הצלבה של יום ושעה). הפרמטרים שבחנתי היו פרקציית הרכבות בכל חתך (תחנה או זמן) שאחרו למעלה מ-6 דקות (האחוזון ה-95 של האיחורים) וכן של הרכבות שאחרו בלמעלה מ-30 דקות (האיחור המקסימלי המצדיק פיצוי לפי נתוני הרכבת.
הנה הקוד ב-R:
library("stringr")
setwd("/home/bernat/Documents/Train")
trainsDataSet<-read.csv("output_for_gzip.csv")
#dataset cleanup:
#remove duplicated rows
trainsDataSet<-trainsDataSet[!duplicated(trainsDataSet),]
#find trains late by more
them 4 hours to just one station in the line:
#find arrivals more then 4
hours late
lateArrival<-trainsDataSet[which(trainsDataSet$arrive_delay>(240)),]
#for each train more then 4
hours late, find to how many stations
#it was late
lateArrivalFreq<-table(lateArrival$train_num)
#find codes for trains late
more then 4 hours to just one station
singleStation<-names(lateArrivalFreq)[which(lateArrivalFreq==1)]
#find rows representing the
problematic arrivals
rowsToRemove<-as.integer(row.names(lateArrival)[which(lateArrival$train_num%in%singleStation)])
#remove from data set
trainsDataSet<-trainsDataSet[-(rowsToRemove),]
#since depature > 10
hours before the schduled time was
#found to be caused by
difference in dates, these were removed too
trainsDataSet<-trainsDataSet[which(trainsDataSet$depart_delay>(-600)),]
#add a colum representing
the day of the week for each train
trainsDataSet$day<-weekdays(as.Date(as.character(trainsDataSet$date)))
trainsDataSet$day<-factor(trainsDataSet$day,levels=unique(trainsDataSet$day))
#extract hour of the
journey
hourmarker<-str_locate(trainsDataSet$arrive_expected,":")[,1]
#add colum reprsenting the
hour of the journey
trainsDataSet$hour<-with(trainsDataSet,
str_sub(arrive_expected,start=hourmarker-2,
end=hourmarker-1))
trainsDataSet$hour<-as.integer(trainsDataSet$hour)
trainsDataSet$rashHour<-FALSE
trainsDataSet$rashHour[which(trainsDataSet$hour%in%c(7:10,17:19))]<-TRUE
trainsDataSet$rashHour[which(trainsDataSet$day%in%c("Friday","Saturday"))]<-FALSE
#add time column
(combination of day and hour)
trainsDataSet$time<-interaction(trainsDataSet$day,trainsDataSet$hour)
trainsDataSet$time<-factor(trainsDataSet$time,levels=unique(trainsDataSet$time))
lateFrac<-function(factor,cutoff){
#this fucntion runs over
a given factor
#and calculate for each
of its levels
#the fraction of the
observsations in the
#level in which the train
was late more
#then a given time
#params:
#factor - string
reprseniting name of colum
#in data frame
#cutoff - int represnitng
the minimal delay
#to be included in the
fraction
#returns: a data frame
with the factor levels
#and the fraction in each
level
#for which
arrival_delay>cutoff
#the data frame is sorted
in ascending order
df<-trainsDataSet[,c("arrive_delay",factor)]
colnames(df)=c("arrive_delay","factor")
fractions<-with(df,
data.frame(factor=levels(factor),
fraction=tapply(arrive_delay,factor, function(x)
length(which(x>=cutoff))/length(x))))
fractions[order(fractions$fraction),]
}
#calculate and plot for
each station fraction of trains for which the delay
#was greater then the 95th
percentile
upper<-quantile(trainsDataSet$arrive_delay,c(0.95))
fracUpper<-lateFrac("stop_name",upper)
png(filename="Late6MinsbyStation.png",bg="white",res=300,height=3000,
width=5000)
qbar<-barplot(fracUpper$fraction,xaxt="n",
main="fractions of arrival where delay >6 mins")
text(cex=1, x=qbar+0.1,
y=0, fracUpper$factor, xpd=TRUE, srt=30)
dev.off()
#calculate and plot for
each station fraction of trains for which the delay
#was greater then 30 mins
(the minimal delay for which a
#compensation can be
claimed)
reimbuseUpper<-lateFrac("stop_name",30)
with(trainsDataSet,data.frame(Stop=levels(stop_name),
Outliers=tapply(arrive_delay,stop_name, function(x)
length(which(x>30))/length(x))))
png(filename="Late30MinsStation.png",bg="white",res=300,height=3000,
width=5000)
b<-barplot(reimbuseUpper$fraction,xaxt="n",
main="fraction
of arrivals where delay>30 mins")
text(cex=1, x=b+0.1, y=0,
reimbuseUpper$factor, xpd=TRUE, srt=30)
dev.off()
#calculate delay fractions
according to time (weekday+hour)
delayByTime<-data.frame(outlier=lateFrac("time",upper),reimbuse=lateFrac("time",30))
delayByTime$rashHour<-
trainsDataSet$rashHour[match(rownames(delayByTime),trainsDataSet$time)]
#mask trains in weekend
(these where found to cause artifacts
#such as only one train in
time slot - so the dealy rate for the
#time slot was 100&)
weekend<-is.na(str_match(rownames(delayByTime),"Friday|Saturday"))
delayByTime<-delayByTime[weekend,]
#susbset the fraction
table(for visualization purposes)
delayByTime<-delayByTime[order(delayByTime$outlier.fraction),]
late<-delayByTime[which(delayByTime$outlier.fraction>0.05),]
png(filename="Late6MinsTime.png",bg="white",res=300,height=3000,
width=10000)
b<-barplot(late$outlier.fraction,xaxt="n",
col=late$rashHour,
main="fraction
of arrivals where delay>6 mins")
text(cex=1, x=b+0.1, y=0,
rownames(late), xpd=TRUE, srt=30,col="red")
legend("topleft",c("rash
hour","non rash hour"),pch=15,col=c("black","white"),cex=3)
dev.off()
delayByTime<-delayByTime[order(delayByTime$reimbuse.fraction),]
reimbuse<-delayByTime[which(delayByTime$reimbuse.fraction>1e-3),]
png(filename="Late30MinsTime.png",bg="white",res=300,height=3000,
width=10000)
b<-barplot(late$reimbuse.fraction
,xaxt="n", col=late$rashHour,
main="fraction
of arrivals where delay>30 mins")
text(cex=1, x=b+0.1, y=0,
rownames(late), xpd=TRUE, srt=30,col="red")
legend("topleft",c("rash
hour","non rash hour"),pch=15,col=c("black","white"),cex=3)
dev.off()