ggbiplot

Bare fordi. ggbiplot er en lækker pakke. Det kan bare være en lidt frustrerende oplevelse at få den installeret. Der er bøvl med forskellige versioner af R, og når 60 studerende på et kursus alle prøver at installere fra github samtidig, støder man næsen mod begrænsninger i githubs api. Så. Kopier nedenstående, kør det i R, og funktionen ggbiplot er tilgængelig.

ggbiplot <- function (pcobj, choices = 1:2, scale = 1, pc.biplot = TRUE, 
    obs.scale = 1 - scale, var.scale = scale, groups = NULL, 
    ellipse = FALSE, ellipse.prob = 0.68, labels = NULL, labels.size = 3, 
    alpha = 1, var.axes = TRUE, circle = FALSE, circle.prob = 0.69, 
    varname.size = 3, varname.adjust = 1.5, varname.abbrev = FALSE, 
    ...) 
{
    library(ggplot2)
    library(plyr)
    library(scales)
    library(grid)
    stopifnot(length(choices) == 2)
    if (inherits(pcobj, "prcomp")) {
        nobs.factor <- sqrt(nrow(pcobj$x) - 1)
        d <- pcobj$sdev
        u <- sweep(pcobj$x, 2, 1/(d * nobs.factor), FUN = "*")
        v <- pcobj$rotation
    }
    else if (inherits(pcobj, "princomp")) {
        nobs.factor <- sqrt(pcobj$n.obs)
        d <- pcobj$sdev
        u <- sweep(pcobj$scores, 2, 1/(d * nobs.factor), FUN = "*")
        v <- pcobj$loadings
    }
    else if (inherits(pcobj, "PCA")) {
        nobs.factor <- sqrt(nrow(pcobj$call$X))
        d <- unlist(sqrt(pcobj$eig)[1])
        u <- sweep(pcobj$ind$coord, 2, 1/(d * nobs.factor), FUN = "*")
        v <- sweep(pcobj$var$coord, 2, sqrt(pcobj$eig[1:ncol(pcobj$var$coord), 
            1]), FUN = "/")
    }
    else if (inherits(pcobj, "lda")) {
        nobs.factor <- sqrt(pcobj$N)
        d <- pcobj$svd
        u <- predict(pcobj)$x/nobs.factor
        v <- pcobj$scaling
        d.total <- sum(d^2)
    }
    else {
        stop("Expected a object of class prcomp, princomp, PCA, or lda")
    }
    choices <- pmin(choices, ncol(u))
    df.u <- as.data.frame(sweep(u[, choices], 2, d[choices]^obs.scale, 
        FUN = "*"))
    v <- sweep(v, 2, d^var.scale, FUN = "*")
    df.v <- as.data.frame(v[, choices])
    names(df.u) <- c("xvar", "yvar")
    names(df.v) <- names(df.u)
    if (pc.biplot) {
        df.u <- df.u * nobs.factor
    }
    r <- sqrt(qchisq(circle.prob, df = 2)) * prod(colMeans(df.u^2))^(1/4)
    v.scale <- rowSums(v^2)
    df.v <- r * df.v/sqrt(max(v.scale))
    if (obs.scale == 0) {
        u.axis.labs <- paste("standardized PC", choices, sep = "")
    }
    else {
        u.axis.labs <- paste("PC", choices, sep = "")
    }
    u.axis.labs <- paste(u.axis.labs, sprintf("(%0.1f%% explained var.)", 
        100 * pcobj$sdev[choices]^2/sum(pcobj$sdev^2)))
    if (!is.null(labels)) {
        df.u$labels <- labels
    }
    if (!is.null(groups)) {
        df.u$groups <- groups
    }
    if (varname.abbrev) {
        df.v$varname <- abbreviate(rownames(v))
    }
    else {
        df.v$varname <- rownames(v)
    }
    df.v$angle <- with(df.v, (180/pi) * atan(yvar/xvar))
    df.v$hjust = with(df.v, (1 - varname.adjust * sign(xvar))/2)
    g <- ggplot(data = df.u, aes(x = xvar, y = yvar)) + xlab(u.axis.labs[1]) + 
        ylab(u.axis.labs[2]) + coord_equal()
    if (var.axes) {
        if (circle) {
            theta <- c(seq(-pi, pi, length = 50), seq(pi, -pi, 
                length = 50))
            circle <- data.frame(xvar = r * cos(theta), yvar = r * 
                sin(theta))
            g <- g + geom_path(data = circle, color = muted("white"), 
                size = 1/2, alpha = 1/3)
        }
        g <- g + geom_segment(data = df.v, aes(x = 0, y = 0, 
            xend = xvar, yend = yvar), arrow = arrow(length = unit(1/2, 
            "picas")), color = muted("red"))
    }
    if (!is.null(df.u$labels)) {
        if (!is.null(df.u$groups)) {
            g <- g + geom_text(aes(label = labels, color = groups), 
                size = labels.size)
        }
        else {
            g <- g + geom_text(aes(label = labels), size = labels.size)
        }
    }
    else {
        if (!is.null(df.u$groups)) {
            g <- g + geom_point(aes(color = groups), alpha = alpha)
        }
        else {
            g <- g + geom_point(alpha = alpha)
        }
    }
    if (!is.null(df.u$groups) && ellipse) {
        theta <- c(seq(-pi, pi, length = 50), seq(pi, -pi, length = 50))
        circle <- cbind(cos(theta), sin(theta))
        ell <- ddply(df.u, "groups", function(x) {
            if (nrow(x) <= 2) {
                return(NULL)
            }
            sigma <- var(cbind(x$xvar, x$yvar))
            mu <- c(mean(x$xvar), mean(x$yvar))
            ed <- sqrt(qchisq(ellipse.prob, df = 2))
            data.frame(sweep(circle %*% chol(sigma) * ed, 2, 
                mu, FUN = "+"), groups = x$groups[1])
        })
        names(ell)[1:2] <- c("xvar", "yvar")
        g <- g + geom_path(data = ell, aes(color = groups, group = groups))
    }
    if (var.axes) {
        g <- g + geom_text(data = df.v, aes(label = varname, 
            x = xvar, y = yvar, angle = angle, hjust = hjust), 
            color = "darkred", size = varname.size)
    }
    return(g)
}
Udgivet i R

Waffle charts

A rather popular chart type. Not really my favorite, but I can see how it makes things easier to understand for people who are not used to read and understand charts. The reason for my less than favourable view on waffle charts are probably linked to its overuse in meaningless infographics.

A waffle chart is a grid with squares/cells/icons/whatever, where each cell represents a number of something.

Lets make an example:

library(ggplot2)
library(waffle)
vec <- c(`Category 1 (10)`= 10 , `Category 2 (20)`= 20,
              `Category 3 (25)`= 24, `Category 4 (16)` = 16)

waffle(vec/2, rows=3, size=0.1, 
       colors=c("#c7d4b6", "#a3aabd", "#a0d0de", "#97b5cf"), 
       title="Four different categories of something", 
       xlab="1 square = 2 somethings")

plot of chunk unnamed-chunk-2
One annoyance: waffle wants you to spell colours wrong.

waffle takes a named vector of values, rows sets the number of rows of blocks. Default is 10.

One standard way, is to show a 10×10 grid, where each cell represents 1% of the total:

waffle(vec/sum(vec)*100)

plot of chunk unnamed-chunk-3

Bloody annoying – waffle rounds the values of the vector, leading to only 98 squares. So you have to manipulate your vector to get to 100. Well, actually it is probably a minor annoyance.

What if you want something else than coloured squares?

The arguments “use_glyph” and “glyph_size” makes that possible.
First, we’ll need the library extrafont

library(extrafont)

We’ll also need to have the “awesomefonts” installed. It can be downloaded from:

http://maxcdn.bootstrapcdn.com/font-awesome/4.3.0/fonts/fontawesome-webfont.ttf

This should be easier if you are on a desktop machine. As I’m running this through my own installation of RStudio on a remote server, it was a bit more difficult.

I needed to place the “fontawesome.ttf” file in the “/usr/share/fonts/truetype/fontawesome” directory.

Then, running R as superuser on the commandline, I imported the extrafont library, and then ran “font_import()”.

But then it worked!

There is now a long list of 593 different icons, that can be used. If you want a list, just run fa_list().

And now, we can make a waffle chart with the glyph of our choice.

waffle(vec/2, rows=4, use_glyph = "wifi")
## Warning: Removed 1 rows containing missing values (geom_text).

plot of chunk unnamed-chunk-5

We can change the colours:

library(RColorBrewer)
waffle(vec/2, rows=4, use_glyph = "wifi", colors=brewer.pal(4,"Set1"))
## Warning: Removed 1 rows containing missing values (geom_text).

plot of chunk unnamed-chunk-6

Adjust the size

waffle(vec/2, rows=4, use_glyph = "wifi", colors=brewer.pal(6,"Set1"), glyph_size=5)
## Warning: Removed 1 rows containing missing values (geom_text).

plot of chunk unnamed-chunk-7
But that does not look very good.

waffle is based on ggplot, so we have access to the full range of functionality. But not all of them are going to look good in this context:

waffle(vec/2, rows=4, use_glyph = "wifi", colors=brewer.pal(4,"Set1")) +
  geom_label(label="42", size = 3)
## Warning: Removed 1 rows containing missing values (geom_text).

plot of chunk unnamed-chunk-8

If we install a font that supports it, we even get access to the large number of UTF-8 glyphs. Here is a favorite of mine:

waffle(vec/2, rows=4, colors=brewer.pal(4,"Set1")) +
  geom_label(label=sprintf("\U1F427"), size = 8)

plot of chunk unnamed-chunk-9

Which of course requires you to have a font on your computer that supports penguins.

Here is one:
http://users.teilar.gr/~g1951d/Symbola.zip

Udgivet i R

Project Euler 39

Project Euler 39

We’re looking at Pythagorean triplets, that is equations where a, b and c are integers, and:

a2 + b2 = c2

The triangle defined by a,b,c has a perimeter.

The triplet 20,48,52 fulfills the equation, 202 + 482 = 522. And the perimeter of the triangle is 20 + 48 + 52 = 120

Which perimeter p, smaller than 1000, has the most solutions?

So, we have two equations:

a2 + b2 = c2

p = a + b + c

We can write

c = p – a – b

And substitute that into the first equation:

a2 + b2 = (p – a -b)2

Expanding the paranthesis:

a2 + b2 = p2 – ap – bp – ap + a2 + ab – bp + ab + b2

Cancelling:

0 = p2 – 2ap – 2bp + 2ab

Isolating b:

0 = p2 – 2ap – b(2p – 2a)

b(2p – 2a) = p2 – 2ap

b = (p2 – 2ap)/(2p – 2a)

So. For a given value of p, we can run through all possible values of a and get b. If b is integer, we have a solution that satisfies the constraints.

The smallest value of a we need to check is 1. But what is the largest value of a for a given value of p?

We can see from the pythagorean equation, that a =< b < c. a might be larger than b, but we can then just switch a and b. So it holds. What follows from that, is that a =< p/3.

What else? If a and b are both even, a2 and b2 are also even, then c2 is even, and then c is even, and therefore p = a + b + c is also even.

If a and b are both uneven, a2 and b2 are also uneven, and c2 is then even. c is then even. And therefore p = a + b + c must be even.

If either a or b are uneven, either a2 or b2 is uneven. Then c2 is uneven, and c is then uneven. Therefore p = a + b + c must be even.

So. I only need to check even values of p. That halves the number of values to check.

Allright, time to write some code:

current_best_number_of_solutions <- 0

for(p in seq(2,1000,by=2)){
  solutions_for_current_p <- 0
  for(a in 1:ceiling(p/3)){
    if(!(p**2-2*a*p)%%(2*p-2*a)){
      solutions_for_current_p <- solutions_for_current_p + 1
    }
  }
  if(solutions_for_current_p > current_best_number_of_solutions){
    current_best_p <- p
    current_best_number_of_solutions <- solutions_for_current_p
   }
}

answer <- current_best_p

current_best_number_of_solutions is initialized to 0.

For every p from 2 to 1000, in steps of 2 (only checking even values of p), I set the number of solutions_for_current_p to 0.

For every value a from 1 to p/3 – rounded to to an integer: If !(p2-2*a*p)%%(2*p-2*a) is true, that is, if the remainder of (p2-2*a*p)/(2*p-2*a) is 0, I increment the solutions_for_current_p.

After running through all possible values of a for the value of p we have reached in the for-loop:

If the number of solutions for this value of p is larger, than the previous current_best_number_of_solutions, we have found a value of p that has a higher number of solutions than any previous value of p we have examined. In that case, set the current_best_p to the current value of p. And the current_best_number_of_solutions to the number of solutions we have found for the value of p.

If not, dont change anything, reset solutions_for_current_p and check a new value of p.

Migrants visualized

First of all, this is in no way a statement on the immigration crisis in Europe. I do have opinions. But it is more a reaction or reflection on three maps I saw on this page.

Danish televison channel TV2 is illustrating the number of refugees or perhaps rather immigrants received in EU-memberstates in the period 2015 to 2017. This is the map showing the number of immigrants to EU in 2015

Note Germany. Germany welcomed the absolutely highest number of immigrants. What piqued my interest though, is that this might be a good illustration of the numbers, it is not really the relevant comparisons. Yes, Germany welcomed more refugees than Denmark did. But Germany is a rather larger country than Denmark. For a given value of “fair”, it is only fair that Germany takes more refugees than smaller countries.

A more relevant comparison might be the number of refugees compared to population. Or area. Sweden saw (at that time) no problems with welcoming a huge number of migrants, because, as they said, there are a lot of un-populated space in Sweden, plenty of room for everyone! Or perhaps GDP is a better way. Richer countries should shoulder a larger part of the challenge than poorer countries.

I’m not concerned here with what is fair. What concerns me is that the graphic is misleading. Lets make an attempt at fixing that. Or at least present a slightly different perspective on the data.

I’ll try to illustrate the number of migrants as a proportion of population in the different countries. The data is “stolen” directly from the news-channel. They have it from UNHCR, Eurostat and the European Parlament.

The first step will be to get the data.

url <- "http://nyheder.tv2.dk/udland/2018-06-28-se-kortet-saa-mange-asylansoegere-har-de-forskellige-eu-lande-taget"
data <- readLines(url)

By inspection, I can see that the relevant data is in these three lines:

dat.2015 <- data[460]
dat.2016 <- data[409]
dat.2017 <- data[358]

There is a small problem. Strange danish characters are encoded. Lets fix that:

library(stringr)
data <- str_replace_all(data,"\\\\u00d8", "Ø")
data <- str_replace_all(data,"\\\\u00e6", "æ")
data <- str_replace_all(data,"\\\\u00f8", "ø")

And load it into the separate variables again:

dat.2015 <- data[460]
dat.2016 <- data[409]
dat.2017 <- data[358]

Next, I need to get the names of the countries, and the number of migrants received in each country.

The data I’m after looks like this:

\“Bulgarien\”:{\“valueheat\”:20365,\“valuecolored\”:\“none\”,\“description\”:\“\”},

And the regular expression picking that out of the data ought to be:

‘\“(\p{L}+)\”:{\“valueheat\”:(\d+|\“\”),’

For some reason that is not working. I probably should try to figure that out, but I’m on vacation, and would rather drink cold white wine that dig too deep into the weirdness that is regular expressions in R.

Instead I’m going to use this simpler pattern:

pattern <- '(\")(\\w+)(.*?)(\\d+|\\\"\\\")'

And then fix problems later.

Extracting the data:

dat.2015 <- unlist(str_extract_all(dat.2015, pattern))
dat.2016 <- unlist(str_extract_all(dat.2016, pattern))
dat.2017 <- unlist(str_extract_all(dat.2017, pattern))

Inspecting the data, reveals that the interesting parts are lines 23 to 111.

dat.2015 <- dat.2015[23:111]
dat.2016 <- dat.2016[23:111]
dat.2017 <- dat.2017[23:111]

An example of two lines:

dat.2015[11:12]
## [1] "\"Danmark\":{\"valueheat\":20935"              
## [2] "\"valuecolored\":\"none\",\"description\":\"\""

First, lets get rid of the second line. There are one of those for each country.

Secondly, I’ll remove the first to characters in the first line. \“ to be precise.

And thirdly, I’ll split that line on \”:{\“valueheat\”:

Using the nice pipes makes that easy:

library(purrr)
dat.2015 <- dat.2015 %>%
  discard(str_detect, "valuecolored") %>%
  substring(2) %>%
  strsplit(split="\":{\"valueheat\":",fixed=TRUE)

That should give me a list where each element is a vector with two elements:

dat.2015[[8]]
## [1] "Finland" "32345"

Neat! Lets do that with the other years:

dat.2016 <- dat.2016 %>%
  discard(str_detect, "valuecolored") %>%
  substring(2) %>%
  strsplit(split="\":{\"valueheat\":",fixed=TRUE)

dat.2017 <- dat.2017 %>%
  discard(str_detect, "valuecolored") %>%
  substring(2) %>%
  strsplit(split="\":{\"valueheat\":",fixed=TRUE)

Now, lets get these data into some dataframes. First I’m unlisting the data, then I pour it into a matrix to get the right shape. And then I’m converting the matrices to dataframes:

dat.2017 <- data.frame(matrix(unlist(dat.2017), ncol=2, byrow=T), stringsAsFactors = F)
dat.2016 <- data.frame(matrix(unlist(dat.2016), ncol=2, byrow=T), stringsAsFactors = F)
dat.2015 <- data.frame(matrix(unlist(dat.2015), ncol=2, byrow=T), stringsAsFactors = F)

There is a slight problem. Albania was the first country in the list. And the structure of the raw data was a bit different.

dat.2015[1,1]
## [1] "regions\":{\"Albanien"

Lets fix that:

dat.2015[1,1] <- "Albanien"
dat.2016[1,1] <- "Albanien"
dat.2017[1,1] <- "Albanien"

And let me just change the column names:

colnames(dat.2015) <- c("Land", "2015")
colnames(dat.2016) <- c("Land", "2016")
colnames(dat.2017) <- c("Land", "2017")

“Land” is danish for “country”.

I’m going to need just one dataframe. I get that by joining the three dataframes:

library(dplyr)
total <- left_join(dat.2015, dat.2016, by="Land")
total <- left_join(total, dat.2017, by="Land")

The numbers are saved as characters. Converting them to numeric:

total$`2015` <- as.numeric(total$`2015`)
## Warning: NAs introduced by coercion
total$`2016` <- as.numeric(total$`2016`)
## Warning: NAs introduced by coercion
total$`2017` <- as.numeric(total$`2017`)
## Warning: NAs introduced by coercion

That introduced some NAs. Countries where there are no data.

Inspecting the data, I can see that there are data for all three years for some countries. For other countries, there are no data at all. The function complete.cases() will return true for a row without NAs.

Using that to get rid of countries where we don’t have complete data:

total <- total[complete.cases(total),]

Next is getting some figures for the populations.

The relevant page on Wikipedia is:

url <- 'https://da.wikipedia.org/wiki/Verdens_landes_befolkningsst%C3%B8rrelser'

Getting that:

library(XML)
library(httr)
r <- GET(url)
doc <- readHTMLTable(
  doc=content(r, "text"))
tabellen <- doc$'NULL'

colnames(tabellen) <-  apply(tabellen[1,],2,as.character)

I’m only interested in the country name, and the population:

tabellen <- tabellen %>%
  select(`Land (eller territorium)`, Population)

Renaming the colums:

colnames(tabellen) <- c("Land", "Population")
tabellen$Population <- as.character(tabellen$Population)
tabellen$Population <- as.numeric(str_remove_all(tabellen$Population, fixed(".")))
## Warning: NAs introduced by coercion

And while I’m at it, the second line gets rid of the factors, and the third removes the thousand separators (“.”)

Now I can join the dataframe containing population figures, with the dataframe containing countries and number of migrants:

total <- left_join(total, tabellen, by="Land")
## Warning: Column `Land` joining character vector and factor, coercing into
## character vector

There are three smaller problems. Cyprys, France and Ireland. The problem is that the country name I get from Wikipedia contains a note. I might be able to get rid of that by code. I’m going to do it manually.

total[10,5] <- 4635400
total[7,5] <- 67286000
total[3,5] <- 847000

Now I have a nice dataframe with the name of the countries (in danish), the numbe of migrants received in 2015, 2016 and 2017, and the population in 2018.

Now it is time to look at some maps.

library(ggplot2)
library(rworldmap)

I am going to match the countries in my dataframe, with the countries I get from the map data. That requires that I have the english names for the countries in my dataframe.

This is the translation:

enland <- c("Belgium", "Bulgaria", "Cyprus", "Denmark", "Estonia", "Finland", "France", "Greece", "Netherlands", "Ireland",
          "Italy", "Croatia", "Latvia", "Lithuania", "Luxembourg", "Malta", "Poland", "Portugal", "Romania", "Slovakia",
          "Slovenia", "Spain", "United Kingdom", "Sweden", "Czech Rep.", "Germany", "Hungary", "Austria"  )

Getting that into the data frame:

total$enland <- enland

Next, lets get the map:

worldmap <- getMap()

That retrieves data for the entire world. I’m only interested in EU:

EU <- worldmap[which(worldmap$NAME %in% enland),]
EU <- map_data(EU)

The first line extracts the part of the world map that has names in the list of countries that I have data for.
map_data() converts that into a nice structure that is suitable for entering into ggplot.

Next step is calculating the number of migrants received in each country as a proportion of that countrys population:

total <- total %>%
  mutate(`2015` = `2015`/Population*100, `2016` = `2016`/Population*100, `2017`=`2017`/Population*100)

I’m mutating the columns 2015-2017 by dividing by population. And multiplying by 100 to get percentages.

The almost final step, is to join my migrant-proportions with the map data:

total <- left_join(total,EU, by=c("enland"="region") )

The map data does not call the countries for countries. Rather their names are saved in the variable “region”.

And now the final step. I’m going to need the data on tidy form. So I’m loading tidyr.

Then I pass the data frame to select(), where I pick out the variables I need. Long(itude), lat(itude), 2015, 2016 and 2017, and the name of the country.
That is passed to gather(), where I make a new row for each year, with the proportions in the new variabels year and prop.

All that is passed to ggplot, and a layer where the polygons describing the countries are plotted. They are with a colour matching the proportions. And grouped by “group”. This is important. Grouping by country name gives weird results. I’ll get back to that. color=“white” plots the lines in the polygons in white.

Finally, I facet the data on year.

library(tidyr)
total %>%
  select(long,lat,`2015`,`2016`,`2017`, group) %>%
  gather(year, prop,  `2015`:`2017`)%>%
  ggplot() +
    geom_polygon(aes(long, lat, fill = prop, group = group), color = "white") +
    theme_void() +
    facet_wrap(~year, ncol=2)

plot of chunk unnamed-chunk-32

Thats it!
And now the picture is slightly different. What is interesting is that Germany still takes a higher proportion of the migrants than other countries. But in 2015, they didn’t. That was the year when the german chancellor Angela Merkel said the famous words “Wir schaffen das”, We’ll manage. But also the year when Hungary and sweden welcomed migrants in numbers equalling 1.79% and 1.65% of their population respectively. You can compare that with the fact that Germany the same year received migrants equalling 0.58% of their population.

A cynic might claim that it is no surprise that Sweden and Hungary closed their borders late in 2015.

Any way, that is a different subject. I just think that these three maps are slightly more informative than what TV2 provided.

Also, I promised to get back to the group thingy.

Making the same plot, but grouping on country names:

total %>%
  select(long,lat,`2015`,`2016`,`2017`, group, enland) %>%
  gather(year, prop,  `2015`:`2017`)%>%
  ggplot() +
    geom_polygon(aes(long, lat, fill = prop, group = enland), color = "white") +
    theme_void()

plot of chunk unnamed-chunk-33

What happens is that the polygons describing Italy are grouped in a way that connects the parts describing sicily to the northern part of Italy. That looks weird. The same happens with Sardinia.

Finally. I have not been very consistent in my use of words. I have used “received” and “welcomed” interchangeably. Hungary and Denmark has not been very welcoming. But we are talking about real humans here, and welcoming simply sounds nicer than received. Complicating the situation was the fact that a lot of the arrivals were not actually what we would normally call refugees. At least not refugees from war. So I have also not been consistent in the use of “migrant” vs “refugee”. That is not really my point. The point is that we should always think about how these kinds of numbers are presented.

Udgivet i R

An engineers’ dream come true

Project Euler, problem 263 – An engineers’ dream come true

This is, at the time of coding, the highest numbered Project Euler problem I’ve tried to tackle. With a difficulty rating of 75% it is also the most difficult. At least on paper. But An engineers’ dream come true? How can I not, as an engineer, not try to solve it?

We har looking for an n, for which it holds that:

  • n-9 and n-3 must be consecutive primes
  • n-3 and n+3 must also be consecutive primes
  • n+3 and n+9 must also be consecutive primes.

These are primepairs that are “sexy”, that is that have differences of 6.

Also, n, n-8, n-4, n+4 and n+8 must be practical numbers, that is numbers where the numbes 1:n can be written as sums of distinct divisors of n.

So if a number n gives sexy prime pairs, and are very practical – that is what an engineer dreams of – hence the paradise.

The fastest way seems to be to generate a list of primes, sieve those out that conforms to the requirements for consecutive primes, and then test those numbers for practicality.

Lets get started!

The trusty numbers library provides the primes, up to 1000000. Then for each of those primes, return the n-value, if the differences between the sets of primes, are 6.

library(numbers)

primenumbers <- Primes(1000000)
primecandidates <- numeric()
for(i in 1:(length(primenumbers)-4)){
if((primenumbers[i+1] - primenumbers[i] == 6) & (primenumbers[i+2] - primenumbers[i+1]== 6) & (primenumbers[i+3] - primenumbers[i+2]== 6)){
  primcandidates <- c(primecandidates,(primenumbers[i]+9))
  }
}

What we get from this, is not primenumbers, but values of n, that gives the consecutive primes we need.

Now I have a list of candidate n’s based on the requirements for primes. Next I need to check for practicality.

First I tried a naive way. Generate the list of numbers that I need to sum to using distinct divisors, for a given x.
Then get the divisors of x. I dont need to check if I can sum to the numbers that are themselves divisors, so excluding them leaves me with at slightly smaller set. Then I get all the ways I can take two divisors from the set of divisors. Sum them, and exclude them from the list of numbers. I continue until I have taken all the possible combinations of 2, 3, 4 etc divisors, based on how many there are. If there are no numbers left in the vector of numbers that I need to be able to sum to, I was able to express all those numbers as sums of distinct divisors. And then x was a practical number.

practical <- function(x){
  test <- 1:x
  divs <- divisors(x)
  test <- setdiff(test,divs)
  for(i in 2:length(divs)){
    test <- setdiff(test,combn(divs,i,sum))
  }
  !as.logical(length(test))
}

Two problems. One that can be fixed. I continue to generate combinations of divisors and summing them, even if I have already found ways to sum all the numbers. The second problem is more serious. When I have to test a number with really many divisors – it takes a long time. Also, handling a vector containing all numbers 1:1000000 takes a lot of time.

I need a faster way of checking for practicality.

Wikipedia to the rescue. There exists a way of checking. I have no idea why it works. But it does.

For a number x, larger than 1, and where the first primefactor is 2. All primefactors are ordered. Taking each primefactor, that has to be smaller than or equal to the sum of the divisors of the product of all the smaller primefactors. Plus one. Oh! And that sum – if 3 is a primefactor twice, that is if 32 is a factor, I should square 3 in the product.

That sounds simple.

For a number x, get the primefactors. Use table to get the counts of the primefactors, ie that 3 is there twice. Those are the values of the result from the table function. The names of the table function are the primefactors.

For each factor from number 2 to the end of the number of factors, get the names of the primefactors from number 1 to just before that factor we are looking at (as numeric). Exponentiate with the values from the table – that is how many times a primefactor is a primefactor. Generate the product, get the divisors of that product, sum them, and add 1. If the factor we were looking at is larger that that, we have determined that x is not practical – and can return FALSE. If x gets through that process, it is practial.

I need to handle the case where there is only one primefactor – 2. Those numbers are practial, but the way I have done the check breaks when there is only one primefactor. Its simple enough, just check if there is only one distinct primefactor, and return TRUE in that case.

practical <- function(x){
  all_factors <- factors(x)
  all_factors <- table(all_factors)
  n_factors <- length(all_factors)
  res <- TRUE
    if(n_factors ==1){
    return(TRUE)
    break()
  }

  for(i in 2:n_factors){
    if((as.numeric(names(all_factors)[i]))>(sum(divisors(prod(as.numeric(names(all_factors)[1:i-1])**as.numeric(all_factors)[1:i-1])))+1)){
      return(FALSE)
      break()
      }
  }
  return(TRUE)
}

So, quite a bit faster!

Now I can take my candidate n’s based on the requirements for primepairs, and just keep the n’s that are themselves practical. And where n-8, n-4, n+4 and n+8 are also practial:

eng_numbers <- primecandidates %>%
  keep(function(x) practical(x-8)) %>%
  keep(function(x) practical(x-4)) %>%
  keep(function(x) practical(x)) %>%
  keep(function(x) practical(x+4)) %>%
  keep(function(x) practical(x+8))

eng_numbers
## numeric(0)

OK. There was no n’s in this set.

This is kinda problem. The n we are looking for are actually pretty large. I know this, because this writeup is written after I found the solution. So it is not because the code is flawed.

Nu har vi så den udfordring, at vi skal have fat i ret høje tal.

primenumbers <- primes(500000000)
primecandidates <- numeric()
for(i in 1:(length(primenumbers)-4)){
if((primenumbers[i+1] - primenumbers[i] == 6) & (primenumbers[i+2] - primenumbers[i+1]== 6) & (primenumbers[i+3] - primenumbers[i+2]== 6)){
  primecandidates <- c(primecandidates,(primenumbers[i]+9))
  }
}
eng_numbers <- primecandidates %>%
  keep(function(x) practical(x-8)) %>%
  keep(function(x) practical(x-4)) %>%
  keep(function(x) practical(x))   %>%
  keep(function(x) practical(x+4)) %>%
  keep(function(x) practical(x+8))

length(eng_numbers)
## [1] 3

That gives us three. We need four.

Now – I could just test larger and larger sets of primes. I run into memory problems when I try that. Then I could write some code, that generates primenumbers between some two numbers, and increse those numbers until I get number four.

I have not. Instead I just tried again and again, until I found number 4. We need to have an upper limit for primes that are some four times larger than the one I just used. Anyway, I found number four. And got the green tickmark.

Lesson learned

Sometimes you really need to look into the math. This would have been impossible if I had not found a quicker way to check for practicality.

Also, I should refactor this code. The check for practicality could certainly be optimised a bit.

And – I should probably check for primality, rather than generating the list of primes.

Project Euler 84

Monopoly – Project Euler problem 84

This is basically a question of building a Monopoly simulator.

We begin at square GO. Roll two six-sided dice, and advance.

If we land on the “Go to jail” (G2J) square, we go to the JAIL square.

If we land on a CC square, we draw a card from the “community chest” stack of cards. There are 16 cards there, only two changes the position of the player: “Advance to GO” and “Go to Jail”

If we land on a CH square, we draw a card from the Chance-stack. There are 16 cards here, only ten changes the position:

  • Advance to GO
  • Go to JAIL
  • Go to C1
  • Go to E3
  • Go to H2
  • Go to R1
  • Go to next R (railway company)
  • Go to next R
  • Go to next U (utility company)
  • Go back 3 squares.

There are three CH squares, and three CC.

A player being sent to jail, is assumed to buy himself out immediately. And if a player rolls three consecutive doubles, he does not advance the result of the third roll, but goes directly to jail.

The two stacks of cards are shuffled at the beginning.

Find the probability of landing on a given square, and return the three squares that are most likely to land on.

Numbering the squares from 00 to 39, the answer is the concatenated string representing these three squares. For 6 sided dice, the most likely squares to end on after a roll of the dice, are squares 10, 24 and 00. The answer is 102400.

I am going to need some dice:

dice <- function() sample(1:6, 2, replace=TRUE)

I’m gonna need a community chest function. And I am going to take a chance. That is, I am going to asume that the whole “shuffle the deck at the start, and place drawn card at bottom” is going to be equivalent to just picking a random card:

pool <- rep(0,14)
pool <- c(pool,1,11)
cc <- function(cs){
  res <- sample(pool,1)
  ifelse(res==0,cs,res)
}

It takes the current square, builds a vector containing 14 repetitions of 0, one instance of “1” corresponding to GO, and one of “11”, corresponding to jail. Then it returns one of the values.

And a chance function. Again, I am going to make the same assumption as with the community chest function:

ch_pool <- rep(0,6)
ch_pool <- c(ch_pool,1:10)
ch <- function(cs){
 chance <- sample(ch_pool,1) 
 if(chance==0) res <- cs
 if(chance==1) res <- 1
 if(chance==2) res <- 11
 if(chance==3) res <- 12
 if(chance==4) res <- 25
 if(chance==5) res <- 40
 if(chance==6) res <- 6
 if(chance==7&cs==8) res <- 16
 if(chance==7&cs==23) res <- 26
 if(chance==7&cs==37) res <- 6
 if(chance==8&cs==8) res <- 16
 if(chance==8&cs==23) res <- 26
 if(chance==8&cs==37) res <- 6
 if(chance==9&cs==8) res <- 13
 if(chance==9&cs==23) res <- 29
 if(chance==9&cs==37) res <- 13
 if(chance==10) res <- cs-3
 return(res)
}

This is basically the same as the community function. Just a bit longer.

I am also going to need a structure for the playing board:

hit <- numeric(40)

A third chance I’m taking. That the part about hitting three doubles in a row, and then going to jail, will also disappear if I just run this long enough:

curr_square <- 1

for(i in 1:10000000){
  curr_square <- ((curr_square + sum(dice())-1)%%40 + 1) 
  if(curr_square == 31) curr_square <- 11
  if(curr_square %in% c(2,18,34)) curr_square <- cc(curr_square)
  if(curr_square %in% c(8,23,37)) curr_square <- ch(curr_square)
  hit[curr_square] <- hit[curr_square] + 1
}

I begin at square 1. And then I am going to repeat the following 1.000.000 times:
Roll the dice, add the result to the current square. If the result is more than 40, do some simple math to continue from square 1.

If the square I’m landing on is square 31, set new square to 11 – that is the jail.

If it is 2, 18 or 34, run the cc – community chest function. That will return the square that we land on after drawing a card from that stack.

If it is 8, 23 or 37, we have landed on a chance square. Run the ch – chance function. That will return the square that we land on after drawing a card from that stack.

That should give me the frequencies with which the player have landed on each square.

Now I can rank the results:

freq <- rank(hit)

That gives me the rank. Number 40 is the square that was hit most times.

I need to translate that to the numbering of the squares that is used for the result.

squares <- c("00","01","02","03","04","05","06","07","08","09",10:39)
paste(squares[freq %in% c(40,39,38)], collapse="")
## [1] "001024"

pasting together the squares hit most, second most, and third-most times. That is fortunately the result given in the problem.

Now lets try with a new set of dice:

dice <- function() sample(1:4, 2, replace=TRUE)

And the same code:

hit <- numeric(40)
curr_square <- 1

for(i in 1:10000000){
  curr_square <- ((curr_square + sum(dice())-1)%%40 + 1) 
  if(curr_square == 31){ curr_square <- 11}
  if(curr_square %in% c(2,18,34)) curr_square <- cc(curr_square)
  if(curr_square %in% c(8,23,37)) curr_square <- ch(curr_square)
  hit[curr_square] <- hit[curr_square] + 1
}

freq <- rank(hit)

answer <- paste(squares[freq %in% c(40,39,38)], collapse="")

And we are rewarded with the nice green tickmark.

This was not as difficult as I thought. But maybe it would be wise to look at the forum for the problem, and study some of the non-brute force solutions.

Oh – another lesson learned. In the code above, I run through 10.000.000 throw with the dice. I actually wanted to throw the dice 1.000.000 times. Always count your zeroes!

Also – 100.000 thow is enough to get the right result.

Project Euler 179

Project Euler 179

14 has four divisors: 1, 2, 7 and 14.

14+1, ie. 15, also has four divisors: 1, 3, 5 and 15.

How many integers, n, where 1 < n < 107, has the same number of divisors as n + 1?

That is surprisinly simple. Start by figuring out how many divisors all the numbers have.

n <- 10000000
t <- integer(n)



for(s in 1:n){
  t[seq(s,n, by=s)] <- t[seq(s,n, by=s)] + 1
}

First I make a vector with the length 107 containing only zeroes.

Element 1 coresponds to n = 1, element 47 to n = 47.

Every single element in that vector has 1 as a divisor.

Every other element has 2 as a divisor.

Every third element has 3 as a divisor, and so on.

So for every element, in the sequence 2 to 107, in steps of 2, I add 1.

The same for every element in the sequence 3 to 107 in steps of 3.

And so on.

That takes a loooong time to do.

Now I have a vector t, where the value in t[n] is the number of divisors in n.

I now need to compare them.

If I have two identical vectors q and r. And add a 0 to the beginning of one of them, and a 0 at the end of the other. I will have two vectors, where element i+1 in the first is the same as element i in the second (or the other way around). So when I compare q[47] with r[47], I actually compare q[47] with q[48]. That is the comparison I need. When these to values are identicial, I have an iteger with the same number of divisors as the following integer.

Apoligies for the not so fluent prose. It’s pretty warm and I’m rather tired.

Anyway:

q <- c(0,t)
r <- c(t,0)

I now just need to compare them, and sum the result of that comparison:

answer <- sum(q==r)

Lesson learned

Just because a problem has a high number, doesn’t mean that it is particularly difficult.

Project Euler – 58

Project Euler 58 – Spiral Primes

All right. Lets make a square, by jotting down all numbers from 1 in a spiral like this:

matrix(c(37, 36, 35, 34, 33, 32, 31,38, 17, 16, 15, 14, 13, 30,39, 18,  5,  4,  3, 12, 29,40, 19,  6,  1,  2, 11, 28,41, 20,  7,  8,  9, 10, 27,42, 21, 22, 23, 24, 25, 26,43, 44, 45, 46, 47, 48, 49),7,7,byrow=TRUE)
##      [,1] [,2] [,3] [,4] [,5] [,6] [,7]
## [1,]   37   36   35   34   33   32   31
## [2,]   38   17   16   15   14   13   30
## [3,]   39   18    5    4    3   12   29
## [4,]   40   19    6    1    2   11   28
## [5,]   41   20    7    8    9   10   27
## [6,]   42   21   22   23   24   25   26
## [7,]   43   44   45   46   47   48   49

We’re going anti-clockwise.

What Project Euler thinks is interesting, is that 8 out of the 13 numbers lying on the two diagonals, are prime.

That gives a primal ratio of 8/13, or about 62%

We can continue adding layers to the spiral, and make squares with increasing side length.

We are now tasked with finding the side length of a the square spiral for which the ratio of primes along the two diagonals falls below 10% (for the first time).

The task boils down to: Figure out what numbers are on the diagonals of a square with a given side length. What is the proportion of primes in that set of numbers.

The final number in the spiral of a nxn square is, nxn. That number is in the south-eastern corner of the square.

The number in the south-wester corner is n2 – (n-1). The number in the north-wester corner is n2 – 2(n-1). And the one in the north-eastern corner is n2 – 3(n-1).

The numbers on the diagonals are all corner-numbers in a square. If I want all the numbers on the diagonals in the 7×7 square above, I calculate the corner-numbers for n = 3, n = 5 and n = 7, and collect them all in a vector, along with the center piece, 1.

I’m going to define a numbers-vector containing the numbers on the diagonal, and add 1 to it.

numbers <- numeric()
numbers <- c(1, numbers)

Having that vector, I need to calculate the fraction of the elements in it, that are prime. The numbers library is handy for that:

library(numbers)

prime_fraction <- function(v){
  sum(isPrime(v))/length(v)
}

The function takes a vector as input, runs isPrime() on it. That function returns a logical vector, with TRUE where the element is prime. The sum of that vector is the number of primes in the input vector. Dividing that with the number of elements in the vector, length(), returns the fraction of primes in the input vector.

Now all that remains is adding diagonal numbers to the vector, check the fraction of primes, and continue until that fraction is below 0.1. I’m going to run the first iteration by hand. The fraction of primes in the initial numbers vector is 0, and that is of course less than 0.1, but not the answer I’m looking for.

i <- 3
numbers <- c(numbers, i*i, i*i-(i-1), i*i-2*(i-1), (i*i-3*(i-1)))
i <- i + 2
while(prime_fraction(numbers)>0.1){
  numbers <- c(numbers, i*i, i*i-(i-1), i*i-2*(i-1), i*i-3*(i-1))
  i <- i + 2
}
answer <- i - 2

OK. That gives me the correct answer. One problem: It takes forever. Well, not forever. But too long. The problem is the isPrime() function. A small test tells me that checking the primality of a vector containing 5385 elements, takes about 70 milliseconds. That is pretty quick. But when I do that several thousand times, it adds up.
Also I’m destined to spend eternity in the first circle of R-hell for growing the vector in perhaps the most inefficient way.

Is there a simpler way?

What I’m really looking for is two numbers. The number of primes in the diagonals. And the total number of numbers in the diagonals.

Each diagonal in an nxn square contains n numbers. The center is counted twice, so the total number is given by 2*n – 1.

What about the number of primes? If I instead of adding the four corners to the numbers vector, and testing that for primes, just test the four numbers, and add the number of primes among them to a counter. That should be quicker, as I’m only checking each number for primality once. What should make it even faster, is that I only have to check three numbers. The south-eastern number is always a square, so definitely not prime.

Lets try again. As before, I’m priming the pump, and beginning with the 5×5 square:

i <- 5
primes <- 3

while(primes/(2*i - 1) > 0.1){
  primes <- primes +sum(isPrime(c(i*i-(i-1), i*i-2*(i-1), i*i-3*(i-1))))
  i <- i + 2
}
answer <- i

Much, much faster!

Lessons learned

There usually is a faster way to do things. Also, and that was a surprise. Getting the length of a 5385-element long vector is not that slower than evaluating 2*2693 – 1.

test <- runif(5385)
leng <- function(){
  length(test)
}

i <- 2693
mult <- function(){
  2*i - 1
}

library(microbenchmark)
mbm <- microbenchmark(mult, leng, times = 10000)
## Warning in microbenchmark(mult, leng, times = 10000): Could not measure a
## positive execution time for 71 evaluations.
mbm
## Unit: nanoseconds
##  expr min lq     mean median uq    max neval
##  mult   0  0 193.7454      1  1 373167 10000
##  leng   0  0 197.7259      0  1 767366 10000

Depending on what else is happening on the computer, it might even be faster to take the length than to do simple arithmetic.

library(ggplot2)
autoplot(mbm)
## Warning: Transformation introduced infinite values in continuous y-axis
## Warning: Removed 9946 rows containing non-finite values (stat_ydensity).

plot of chunk unnamed-chunk-7

Not that it makes that much of a difference. But I find it interesting.

Project Euler 35

Project Euler 35 – Circular primes

197 is an example of a circular prime. It is itself a prime. And if we rotate the digits, 971 and 719 are also primes.

Given that 2, 3, 5 and 7 are by definition circular primes – not that we rotate that much, how many circular primes are there below one million?

Lets begin by generating a vector containing all primes below 1000000:

library(numbers)
prim <- Primes(1000000)
length(prim)
## [1] 78498

That is 78498 numbers. Is there any way to reduce that number?

There is. Any prime containg an even digit, will rotate to a number ending in an even digit. That number cannot be a prime.

The same applies to primes containing the digit 5.

Of course there are two special cases, 2 and 5.

Lets reduce the number of primes we need to consider. First make a function checking if there are any “illegal” digits in the number:

poss <- function(x){
  !any(unlist(strsplit(as.character(x),"")) %in% c(0,2,4,5,6,8))  
}

The function splits the number in individual digits, checks if they are in the list of forbidden digits, and return fals if any of the digits of the number is in the list of forbidden digits.

Now paring down the list of primes:

prim <- prim[sapply(prim,poss)]

That leaves us with 1111 possible primes.

Now we need to figure out how to rotate the numbers:

rotate <- function(x){
  res <- integer()
  while(length(res)<as.integer(log10(x)+1)){
    x <- x%/%10 + x%%10*(10**(as.integer(log10(x)+1)-1))
    res <- c(res,x)
  }
  all(res %in% prim)
}

Lets take a four digit number, 1234, as an example.

I begin by initializing a variable res to keep the rotated numbers.

as.integer(log10(x)+1) gives me the length of the number. Here, 4.

Rotating a four digit number will yield four numbers. As long as there are less than four numbers in the result-vector res, I need to rotate at least once more.

I want the first rotation to give the number 4123.

Performing an integer division by 10 gives me the first three digits of the number, here 123. Modulo 10 gives me the last, here 4.

I get the rotation by adding the first three digits of the number to the last number multiplied by 10 to the power of the number of digits in the original number minus 1.

103 is 1000, multiplying by 4 is 4000, adding that to 123 returns 4123. Repeat until there are four numbers in the result-vector.
Then check if all the elements of the result-vector are in the list of primes. If they all are, return TRUE, otherwise return FALSE.

Now it is simply a question of getting rid of all the remaining candidate primes that does not rotate to primes:

prim <- prim[sapply(prim,rotate)]

The lenght is the number of primes that are circular as defined above. I just need to add the two special cases 2 and 5, and I have the answer:

answer <- length(prim) + 2

Project Euler 46

Project Euler 46.

Christian Goldbach apparently proposed, that every odd composite number, can be written as the sum of a prime, and twice a square. As in:

9 = 7 + 2 * 12

I had to look up what a composite number is. It is basically a number that is not a prime.

The conjecture was false. And we should now find the smallest, odd, composite number, that can not be written as the sum of a prime and twice a square.

I was not able to find a shortcut, and apparently no one else has.

So I brute forced it.

I started with loading the trusted numbers, and purrr libraries:

library(numbers)
library(purrr)

Then I guessed, correctly as it turned out, that the number I’m looking for, is less than 100000.

That means, that the prime that can take part in the sum, must also be less than 100000. So – generating two vectors, one of primes below 100000, and candidate odd composite numbers below 100000:

prim <- Primes(100000)
test <- seq(3,100000,by=2)

1 is not included. I’m not sure if it is ruled out for not being a composite. But it is trival that it cannot be a sum of a prime and any thing else (positive). So I’m starting at 3.

Actually test does not contain only composite numbers. Getting rid of the primes is the next step:

test <- setdiff(test, prim)

setdiff() is rather useful. It takes two sets – vectors, and removes all the elements in the last from the first.

Next up is getting candidate squares.
As I’m squaring them, I don’t need to look at squares of numbers larger than 707. Since 708 squared and doubled is larger than 100000.

squares <- 1:707
squares <- 2*squares**2

Next I’m writing a function to test if any x can be expressed as the sum of twice a square and a prime.

x – squares will give me a vector with the result of substracting each of the elements in squares from x. I do not need to consider values in squares that are larger than x. Subtracting those from x would give a negative number – and that is not a prime.

To do that:

x – squares[squares<x]

Next, are the elements in that vector prime? The easy way is to test if they are in the list of primes. I use the %in% operator for that:

(x – squares[squares<x]) %in% prim

That will return a logical vector, with a value TRUE or FALSE for each element in the vector resulting from the subtraction, depending on whether it is in the list of primes. If any element is TRUE, we have a result.

The any() function does that. If any element in the vector is true, any() will return true.

Putting all of that into a function:

testing <- function(x){
  any((x - squares[squares<x]) %in% prim)
}

I now have a function that returns true, if a number x can be written as the sum of a prime and twice a square.

Finally I filter that through the discard() function – basically getting rid of all the numbers in my test-vector, that can be written as this sum. And finally passing those values to the min() function. And I have my answer:

answer <- test %>%
  discard(function(x) testing(x)) %>%
  min()