Obfuscate And Duplicate Google Analytics Data
Perhaps you didn’t know this, but there’s a really handy demo account for Google Analytics you can use to check out how Google Analytics works in a real business context (the data is from the Google Merchandise Store). However, you can access the account with nothing more than read-only access. This is annoying if you wanted to customize the setup.
Worry not, I have a solution for you! Harnessing the awesome power of customTask
, you can create a duplicate of the data collected on any website where you can modify the tracking (e.g. via Google Tag Manager). Even better, the data will be obfuscated using a dictionary of English words (you can edit this list), and hashing each string in the payload predictably against this dictionary.
As always, you can find this solution in my customTask Builder tool.
Huge thanks to Jaakko Ojalehto, my illustrious 8-bit-sheep developer colleague. He came up with the string replacement algorithm.
The Simmer Newsletter
Subscribe to the Simmer newsletter to get the latest news and content from Simo Ahava into your email inbox!
How to set it up
You’ll want to fetch the latest version of the code from the customTask Builder tool. See also the instructions for how to deploy the customTask
.
In Google Tag Manager, the Custom JavaScript variable will end up looking something like this:
function () {
// customTask Builder by Simo Ahava
//
// More information about customTask: https://www.simoahava.com/analytics/customtask-the-guide/
//
// Change the default values for the settings below.
// obfuscate: Obfuscates the entire hit payload (using a dictionary of words consistently) and dispatches it to the trackingId you provide.
// https://bit.ly/2RectUl
// Configuration and helpers for the obfuscation step.
//   tid           - Tracking ID that receives the obfuscated duplicate hit.
//   dict          - Seed words; init() appends every 'a-b' compound of them.
//   stringParams  - Regex patterns of payload parameters whose values are replaced with dictionary words.
//   priceParams   - Regex patterns of payload parameters whose values are multiplied by priceModifier.
//   priceModifier - Multiplier applied to every price; evaluated once when this object is created.
//   medium        - Campaign media that are randomly attached to a share of the hits.
var obfuscate = {
  tid: 'UA-12345-1',
  dict: ['tumble', 'noble', 'flourish', 'abandon', 'liberal', 'team', 'conflict', 'collar', 'tiger', 'stun', 'grace', 'resource', 'phantom', 'imagine', 'information', 'hall', 'sweet', 'agriculture', 'bingo', 'relative'],
  stringParams: ['uid','ua','dr','cn','cs','cm','ck','cc','ci','gclid','dclid','dl','dh','dp','dt','cd','cg[1-5]','linkid','an','aid','av','aiid','ec','ea','el','ti','ta','in','ic','iv','pr\\d{1,3}id','pr\\d{1,3}nm','pr\\d{1,3}br','pr\\d{1,3}ca','pr\\d{1,3}va','pr\\d{1,3}cc','pr\\d{1,3}cd\\d{1,3}','tcc','pal','col','il\\d{1,3}nm','il\\d{1,3}pi\\d{1,3}id','il\\d{1,3}pi\\d{1,3}nm','il\\d{1,3}pi\\d{1,3}br','il\\d{1,3}pi\\d{1,3}ca','il\\d{1,3}pi\\d{1,3}va','il\\d{1,3}pi\\d{1,3}cd\\d{1,3}','promo\\d{1,3}id','promo\\d{1,3}nm','promo\\d{1,3}cr','promo\\d{1,3}ps','sn','sa','st','utc','utv','utl','exd','cd\\d{1,3}','xid','exp','_utmz'],
  // The impression-list price uses the 'il' prefix, like the other il...pi... parameters above.
  priceParams: ['tr','ts','tt','ip','pr\\d{1,3}pr','il\\d{1,3}pi\\d{1,3}pr'],
  priceModifier: Math.random(),
  medium: ['organic', 'referral', 'social', 'cpc'],

  // Maps a string to a dictionary word; the same input always yields the same word.
  // The empty string is returned unchanged.
  replaceString: function (str) {
    if (str === '') {
      return str;
    }
    if (typeof window.btoa === 'function') {
      str = btoa(str);
    }
    // Concatenate the decimal character codes into one long digit string.
    var digits = str.split('').map(function (ch) {
      return ch.charCodeAt(0);
    }).join('');
    var size = obfuscate.dict.length;
    var index = digits % size;
    if (!isFinite(index)) {
      // For long inputs the digit string overflows to Infinity when coerced to a
      // Number, so the remainder is NaN and no word would be found. Fold the
      // digits one at a time instead; every intermediate value stays small.
      index = 0;
      for (var i = 0; i < digits.length; i++) {
        index = (index * 10 + Number(digits.charAt(i))) % size;
      }
    }
    return obfuscate.dict[index];
  },

  // Extends dict with the compound 'a-b' of every ordered pair of its current words
  // (n words become n + n^2).
  init: function () {
    var compounds = [];
    obfuscate.dict.forEach(function (first) {
      obfuscate.dict.forEach(function (second) {
        compounds.push(first + '-' + second);
      });
    });
    obfuscate.dict = obfuscate.dict.concat(compounds);
  }
};
// DO NOT EDIT ANYTHING BELOW THIS LINE
// Build the compound dictionary once, provided the obfuscate settings define an init function.
if (typeof obfuscate === 'object' && typeof obfuscate.init === 'function') {
  obfuscate.init();
}
// Reads the value stored under `key`.
// Uses localStorage when window.Storage is available; otherwise looks the key up
// in document.cookie and returns undefined when it is not found exactly once.
var readFromStorage = function (key) {
  if (window.Storage) {
    return window.localStorage.getItem(key);
  }
  // Cookie lookup, from: https://stackoverflow.com/a/15724300/2367037
  // Prefixing '; ' lets the first cookie be matched the same way as the others.
  var segments = ('; ' + document.cookie).split('; ' + key + '=');
  if (segments.length !== 2) {
    return undefined;
  }
  return segments[1].split(';')[0];
};
// Stores `value` under `key`.
// Uses localStorage when window.Storage is available; otherwise writes a cookie
// that expires `expireDays` days from now.
var writeToStorage = function (key, value, expireDays) {
  if (window.Storage) {
    window.localStorage.setItem(key, value);
    return;
  }
  var expiry = new Date();
  expiry.setDate(expiry.getDate() + expireDays);
  var cookie = key + '=' + value;
  cookie += ';expires=' + expiry.toUTCString();
  document.cookie = cookie;
};
var globalSendHitTaskName = '_ga_originalSendHitTask';
return function (customTaskModel) {
window[globalSendHitTaskName] = window[globalSendHitTaskName] || customTaskModel.get('sendHitTask');
customTaskModel.set('sendHitTask', function (sendHitTaskModel) {
var originalSendHitTaskModel = sendHitTaskModel,
originalSendHitTask = window[globalSendHitTaskName],
canSendHit = true;
try {
if (canSendHit) {
originalSendHitTask(sendHitTaskModel);
}
// obfuscate
if (typeof obfuscate === 'object' && obfuscate.hasOwnProperty('tid') && obfuscate.hasOwnProperty('dict') && obfuscate.hasOwnProperty('stringParams') && obfuscate.hasOwnProperty('priceParams') && obfuscate.hasOwnProperty('replaceString') && obfuscate.hasOwnProperty('priceModifier')) {
var _o_hitPayload = sendHitTaskModel.get('hitPayload');
obfuscate.stringParams.forEach(function(strParam) {
var regexParam = new RegExp('[?&]' + strParam + '=[^&]+', 'g');
var paramsInHitpayload = _o_hitPayload.match(regexParam) || [];
paramsInHitpayload.forEach(function(keyValue) {
var parts = keyValue.split('=');
var urlParts = parts[1].split('%2F').map(function(urlPart) {
if (/https?:/.test(decodeURIComponent(urlPart))) return urlPart;
return urlPart.split('%20').map(function(wordPart) {
return obfuscate.replaceString(wordPart);
}).join('%20');
}).join('%2F');
_o_hitPayload = _o_hitPayload.replace(parts.join('='), parts[0] + '=' + urlParts);
});
});
obfuscate.priceParams.forEach(function(prParam) {
var regexParam = new RegExp('[?&]' + prParam + '=[^&]+', 'g');
var paramsInHitpayload = _o_hitPayload.match(regexParam) || [];
paramsInHitpayload.forEach(function(keyValue) {
var parts = keyValue.split('=');
var price = parseFloat(parts[1]) || 0.00;
price = (price * obfuscate.priceModifier).toFixed(2);
_o_hitPayload = _o_hitPayload.replace(parts.join('='), parts[0] + '=' + price);
});
});
_o_hitPayload = _o_hitPayload
.replace(
'&tid=' + sendHitTaskModel.get('trackingId') + '&',
'&tid=' + obfuscate.tid + '&'
)
.replace(/[?&]aip($|&|=[^&]*)/, '')
.replace(/[?&]c[sm]=[^&]*/g, '')
.replace(/[?&]uip=[^&]*/g, '');
if (Math.random() <= 0.10) {
_o_hitPayload +=
'&cs=' + obfuscate.dict[Math.floor(Math.random()*obfuscate.dict.length)] +
'&cm=' + obfuscate.medium[Math.floor(Math.random()*obfuscate.medium.length)];
}
_o_hitPayload += '&uip=' +
(Math.floor(Math.random() * 255) + 1) + '.' +
(Math.floor(Math.random() * 255) + 0) + '.' +
(Math.floor(Math.random() * 255) + 0) + '.' +
(Math.floor(Math.random() * 255) + 0);
_o_hitPayload += '&aip=1';
sendHitTaskModel.set('hitPayload', _o_hitPayload, true);
originalSendHitTask(sendHitTaskModel);
}
// /obfuscate
} catch(err) {
originalSendHitTask(originalSendHitTaskModel);
}
});
};
}
That’s quite a bit of code, because it turns out that obfuscating the data consistently and taking care of all the other possible pitfalls with duplicating Google Analytics data isn’t exactly trivial.
Anyway, to set the thing up, you’ll need to edit the configuration object within the var obfuscate = {...}
block. Here are the configuration keys and how to use them. Note! All the keys are required for the solution to work. If you remove one of the keys, obfuscation will be aborted.
Key | Initial value | Description |
---|---|---|
tid |
UA-12345-1 |
The Tracking ID to which you want the data to be dispatched. Only one tracking ID is supported at this time. |
dict |
['tumble', 'noble'...] |
The dictionary of words that will be used. Don’t add too many (20 should suffice). When the function is initialized, it will automatically generate compound words from every item in the dictionary. |
stringParams |
['uid','ua'...] |
All the Measurement Protocol parameters that will be treated as strings and will be replaced with words in the dictionary. The parameter names are regular expression patterns. |
priceParams |
['tr','ts'...] |
All the Measurement Protocol parameters that will be treated as prices and will be modified with the priceModifier value (see below). The parameter names are regular expression patterns. |
priceModifier |
Math.random() |
The modifier which will be used to modify all prices in the payload. The initial value (Math.random() ) basically means that prices will be modified with a random percentage between 0.00 and 1.00. |
medium |
['organic', 'referral'...] |
The list of campaign media that will be randomly assigned to 10% of hits (to get some source/medium variance). |
replaceString |
function |
Internal function, do not modify. |
init |
function |
Internal function, do not modify. |
You’ll want to edit tid
at the very least. The other configurations have completely functional default values, so there’s no need to touch them unless you want to. For example, you might want to rewrite the dict
to include words that actually have to do with some real industry.
To get most out of your data, you’ll want to add this customTask
to all the hits dispatched to a Google Analytics property from your website. That way you’ll get the most comprehensive and realistic data set.
How it works
The obfuscation itself is fairly complex.
First, when the tag is first run, the obfuscator is initialized. This initialization basically takes your dictionary of words, and generates a compound of every word against every word in the dictionary (including itself). Thus the final length of the dictionary is n + n^2
, where n
is the initial length of the dictionary. For example, if this is your initial dictionary:
['baby', 'rock', 'sweet']
The final dictionary will be:
['baby', 'rock', 'sweet', 'baby-baby', 'baby-rock', 'baby-sweet', 'rock-baby', 'rock-rock', 'rock-sweet', 'sweet-baby', 'sweet-rock', 'sweet-sweet']
The obfuscation itself is a multi-step process.
- First, all string parameters from the configuration are looped through. If a match is made in the payload, then the value of the string parameter is first turned into a Base64 representation, and then a simple algorithm is used to turn this encoded string into a number, which is then compressed into an index number of the dictionary.
// For every configured string parameter pattern, find its key=value pairs in the
// payload and replace the value word by word with dictionary entries.
obfuscate.stringParams.forEach(function(strParam) {
var regexParam = new RegExp('[?&]' + strParam + '=[^&]+', 'g');
var paramsInHitpayload = _o_hitPayload.match(regexParam) || [];
paramsInHitpayload.forEach(function(keyValue) {
var parts = keyValue.split('=');
// Split on the encoded slash (%2F) so each path segment is translated on its own.
var urlParts = parts[1].split('%2F').map(function(urlPart) {
// Segments containing http: or https: are returned untouched.
if (/https?:/.test(decodeURIComponent(urlPart))) return urlPart;
// Split on the encoded space (%20) so each word is translated on its own.
return urlPart.split('%20').map(function(wordPart) {
return obfuscate.replaceString(wordPart);
}).join('%20');
}).join('%2F');
_o_hitPayload = _o_hitPayload.replace(parts.join('='), parts[0] + '=' + urlParts);
});
});
This means that every single string will have a consistent counterpart in the dictionary. Some strings will naturally return the same dictionary word, but that’s ok since we’re not going for perfect traceability here, and this will also make it even more difficult to reverse-engineer the translated strings back to their original representations.
If the string is found to have a /
symbol, then each word separated by the slash will be translated separately. This way URLs will be kept intact. In a similar vein, if the string has http:
or https:
, then the protocol will not be translated, because GA requires valid URLs in certain parameters.
Finally, if the strings are comprised of words (separated by whitespace), then each word is translated separately.
- Next, the price parameters are matched in a similar way against the hit payload. If a match is made, then the price is modified by the
priceModifier
from the configuration. Each price using this tracker is modified with the same modifier.
// For every configured price parameter pattern, multiply the matched value by
// obfuscate.priceModifier and write it back with two decimals.
obfuscate.priceParams.forEach(function(prParam) {
var regexParam = new RegExp('[?&]' + prParam + '=[^&]+', 'g');
var paramsInHitpayload = _o_hitPayload.match(regexParam) || [];
paramsInHitpayload.forEach(function(keyValue) {
var parts = keyValue.split('=');
// Values that do not parse as a number are treated as 0.
var price = parseFloat(parts[1]) || 0.00;
price = (price * obfuscate.priceModifier).toFixed(2);
_o_hitPayload = _o_hitPayload.replace(parts.join('='), parts[0] + '=' + price);
});
});
- Then, the tracking ID in the payload is replaced by the one you provide in the configuration object. At the same time, the parameters
aip
, cs
, cm
, and uip
(for Anonymize IP, Campaign Source, Campaign Medium, and Override IP, respectively) are removed from the payload.
// Point the copy at the configured tracking ID, then strip aip, cs, cm and uip.
// Note: the tid replacement is a literal string match that includes the trailing
// '&', so it only applies when tid is followed by another parameter.
_o_hitPayload = _o_hitPayload
.replace(
'&tid=' + sendHitTaskModel.get('trackingId') + '&',
'&tid=' + obfuscate.tid + '&'
)
.replace(/[?&]aip($|&|=[^&]*)/, '')
.replace(/[?&]c[sm]=[^&]*/g, '')
.replace(/[?&]uip=[^&]*/g, '');
- Finally, 10% of all hits are assigned a random campaign source (from the dictionary), with a random medium from the list of
medium
you provided in the configuration.
Also, a random IP address is generated for the hit. Yes, every hit.
Then, the IP address is anonymized with the Anonymize IP parameter.
// Roughly one hit in ten gets a random source (any dictionary word) and a random medium.
if (Math.random() <= 0.10) {
_o_hitPayload +=
'&cs=' + obfuscate.dict[Math.floor(Math.random()*obfuscate.dict.length)] +
'&cm=' + obfuscate.medium[Math.floor(Math.random()*obfuscate.medium.length)];
}
// Random IP override: the first octet is 1-255, the remaining octets are 0-254.
_o_hitPayload += '&uip=' +
(Math.floor(Math.random() * 255) + 1) + '.' +
(Math.floor(Math.random() * 255) + 0) + '.' +
(Math.floor(Math.random() * 255) + 0) + '.' +
(Math.floor(Math.random() * 255) + 0);
// Re-add Anonymize IP for the generated address.
_o_hitPayload += '&aip=1';
Modifying the IP addresses like this leads to interesting data in the list of service providers:
The last thing that happens is that the hit is dispatched to the Tracking ID you provided.
// Put the obfuscated payload on the model and send it with the original sendHitTask.
sendHitTaskModel.set('hitPayload', _o_hitPayload, true);
originalSendHitTask(sendHitTaskModel);
Caveats
It’s not a perfect duplication of the data. Here are some of the things the script has trouble with:
-
All campaign information from the original hit is removed. So the assignment of source/medium information will not follow the logic of the original account. To counter this, I assign a random source/medium to 10% of all hits.
-
The prices are modified with the same percentage, not the same value. Thus, if you have a Transaction Revenue of
10.00
and Product Revenue of 8.00
, and the modifier is 0.8
, the end result will be a Transaction Revenue of 8.00
and Product Revenue of 6.40
. This means that someone could deduce what the original price was, if they assumed, for example, that Transaction Revenue is the total sum of all Product Revenues multiplied by their quantities (as it often is).
-
No integer values are modified. So Custom Metrics, Event Values, quantities, and so forth are not touched. I did this because I don’t think integers encode information that could be used to identify the original source of the data. Prices are modified because with a specific set of prices a user could guess what the origin of the data was, but not so much with integers. I’m happy to modify this in the future if enough people think it’s necessary.
Final thoughts
Whether this solution is useful or not, I can guarantee that writing it was a lot of fun! Just obfuscating the data would have been easy. Just mask every string with a random GUID or something. But trying to figure out a dictionary substitution was far more difficult.
The algorithm I chose (with Jaakko Ojalehto’s help) for the replacement isn’t perfect. The distribution isn’t even. But I think that’s OK. You’ll only end up with 420 words by default anyway, so there’s going to be a LOT of overlap nevertheless, since even a simple site will produce far more than 420 unique strings in the data.
Even if you don’t find this dataset useful, I’ll guarantee you’ll have fun looking at the string combinations produced by the replacement algorithm. In fact, I had to modify the dictionary I initially had, because it resulted in compounds like beat-child sweet-laughter
which I think might raise some eyebrows when the data is displayed in a training session.
Let me know in the comments if this solution needs improvement!