deicide Absurdistan Bankistan

Transcription

deicide
deicIDE
|ˈdeɪɪsʌɪd, ˈdiːɪ-|
!
!
The Rise and Fall of
Integrated Development Environments
The killing of a god
Michele Lanza
REVEAL | Faculty of Informatics
University of Lugano, Switzerland
!
Part I
Inception
!
Part II
Ascension
!
Part III
Transcendence
Bankistan
R
Absurdistan
E
V
E
A
L
de gustibus
non est
disputandum
Tool == Cool
Bugatti Veyron 16.4
987 BHP
408 km/h
Weapons of
Mass Instruction
Part I
Inception
Shelby SuperCars Ultimate Aero TT
1,062 BHP
412 km/h
Bugatti Veyron Super Sport
1,184 BHP
431 km/h
Volkswagen Golf V GTI
?
408 + 235 = 431
197 BHP
235 km/h
Edit
Run
Compile
I
D
E
ntegrated
evelopment
nvironment
It’s time
for new
IDEas
The IDE
must DIE
Software is a collection of computer
programs and related data that provides the
instructions for telling a computer what to do
and how to do it. In contrast to hardware,
software “cannot be touched”.
What is Software?
Programming is a kind of writing.
!
Gerald Weinberg
!
Psychology of Computer Programming
!
Dorset House, 1971
/***************************************************************************/
/*
micro-Max,
*/
/* A chess program smaller than 2KB (of non-blank source), by H.G. Muller */
/***************************************************************************/
/* version 3.2 (2000 characters) features:
*/
/* - recursive negamax search
*/
/* - quiescence search with recaptures
*/
/* - recapture extensions
*/
/* - (internal) iterative deepening
*/
/* - best-move-first 'sorting'
*/
/* - a hash table storing score and best move
*/
/* - full FIDE rules (except minor promotion) and move-legality checking
*/
!
#define F(I,S,N) for(I=S;I<N;I++)
#define W(A) while(A)
#define K(A,B) *(int*)(T+A+(B&8)+S*(B&7))
#define J(A) K(y+A,b[y])-K(x+A,u)-K(H+A,t)
!
!
!
#define U 16777224
struct _ {int K,V;char X,Y,D;} A[U];
/* hash table, 16M+8 entries*/
int V=112,M=136,S=128,I=8e4,C=799,Q,N,i;
/* V=0x70=rank mask, M=0x88 */
!
!
/* piece symbols on printout*/
char O,K,L,
w[]={0,1,1,3,-1,3,5,9},
/* relative piece values
*/
o[]={-16,-15,-17,0,1,16,0,1,16,15,17,0,14,18,31,33,0, /* step-vector lists */
7,-1,11,6,8,3,6,
/* 1st dir. in o[] per piece*/
6,3,5,7,4,5,3,6},
/* initial piece setup
*/
b[129],
/* board: half of 16x8+dummy*/
T[1035],
/* hash translation table
*/
n[]=".?+nkbrq?*?NKBRQ";
programming
is writing
programming
is writing
D(k,q,l,e,J,Z,E,z,n)
/* recursive minimax search, k=moving side, n=depth*/
int k,q,l,e,J,Z,E,z,n; /* (q,l)=window, e=current eval. score, E=e.p. sqr.*/
{
/* e=score, z=prev.dest; J,Z=hashkeys; return score*/
int j,r,m,v,d,h,i=9,F,G;
char t,p,u,x,y,X,Y,H,B;
struct _*a=A;
/* lookup pos. in hash table*/
j=(k*E^J)&U-9;
/* try 8 consec. locations */
W((h=A[++j].K)&&h-Z&&--i);
/* first empty or match
*/
a+=i?j:0;
/* dummy A[0] if miss & full*/
if(a->K)
/* hit: pos. is in hash tab */
{d=a->D;v=a->V;X=a->X;
/* examine stored data
*/
if(d>=n)
/* if depth sufficient:
*/
{if(v>=l|X&S&&v<=q|X&8)return v;
/* use if window compatible */
d=n-1;
/* or use as iter. start
*/
}X&=~M;Y=a->Y;
/*
with best-move hint */
Y=d?Y:0;
/* don't try best at d=0
*/
}else d=X=Y=0;
/* start iter., no best yet */
N++;
/* node count (for timing) */
W(d++<n|z==8&N<1e7&d<98)
/* iterative deepening loop */
{x=B=X;
/* start scan at prev. best */
Y|=8&Y>>4;
/* request try noncastl. 1st*/
m=d>1?-I:e;
/* unconsidered:static eval */
do{u=b[x];
/* scan board looking for
*/
if(u&k)
/* own piece (inefficient!)*/
{r=p=u&7;
/* p = piece type (set r>0) */
j=o[p+16];
/* first step vector f.piece*/
W(r=p>2&r<0?-r:-o[++j])
/* loop over directions o[] */
{A:
/* resume normal after best */
y=x;F=G=S;
/* (x,y)=move, (F,G)=castl.R*/
do{H=y+=r;
/* y traverses ray
*/
if(Y&8)H=y=Y&~M;
/* sneak in prev. best move */
if(y&M)break;
/* board edge hit
*/
software
is text
if(p<3&y==E)H=y^16;
/* shift capt.sqr. H if e.p.*/
t=b[H];if(t&k|p<3&!(r&7)!=!t)break;
/* capt. own, bad pawn mode */
i=99*w[t&7];
/* value of capt. piece t
*/
if(i<0||E-S&&b[E]&&y-E<2&E-y<2)m=I;
/* K capt. or bad castling */
if(m>=l)goto C;
/* abort on fail high
*/
if(h=d-(y!=z))
/* remaining depth(-recapt.)*/
{v=p<6?b[x+8]-b[y+8]:0;
/* center positional pts.
*/
b[G]=b[H]=b[x]=0;b[y]=u&31;
/* do move, strip virgin-bit*/
if(!(G&M)){b[F]=k+6;v+=30;}
/* castling: put R & score */
if(p<3)
/* pawns:
*/
{v-=9*(((x-2)&M||b[x-2]!=u)+
/* structure, undefended
*/
((x+2)&M||b[x+2]!=u)-1);
/*
squares plus bias */
if(y+r+1&S){b[y]|=7;i+=C;}
/* promote p to Q, add score*/
}
v=-D(24-k,-l-(l>e),m>q?-m:-q,-e-v-i,
/* recursive eval. of reply */
J+J(0),Z+J(8)+G-S,F,y,h);
/* J,Z: hash keys
*/
v-=v>e;
/* delayed-gain penalty
*/
if(z==9)
/* called as move-legality */
{if(v!=-I&x==K&y==L)
/*
checker: if move found */
{Q=-e-i;O=F;return l;}
/*
& not in check, signal */
v=m;
/* (prevent fail-lows on
*/
}
/*
K-capt. replies)
*/
b[G]=k+38;b[F]=b[y]=0;b[x]=u;b[H]=t;
/* undo move,G can be dummy */
if(Y&8){m=v;Y&=~8;goto A;}
/* best=1st done,redo normal*/
if(v>m){m=v;X=x;Y=y|S&G;}
/* update max, mark with S */
}
/*
if non castling */
t+=p<5;
/* fake capt. for nonsliding*/
if(p<3&6*k+(y&V)==S
/* pawn on 3rd/6th, or
*/
||(u&~24)==36&j==7&&
/* virgin K moving sideways,*/
G&M&&b[G=(x|7)-(r>>1&7)]&32
/* 1st, virgin R in corner G*/
&&!(b[G^1]|b[G^2])
/* 2 empty sqrs. next to R */
){F=y;t--;}
/* unfake capt., enable e.p.*/
}W(!t);
/* if not capt. continue ray*/
}}}W((x=x+9&~M)-B);
/* next sqr. of board, wrap */
C:if(m>I/4|m<-I/4)d=99;
/* mate is indep. of depth */
m=m+I?m:-D(24-k,-I,I,0,J,Z,S,S,1)/2;
/* best loses K: (stale)mate*/
if(!a->K|(a->X&M)!=M|a->D<=d)
/* if new/better type/depth:*/
{a->K=Z;a->V=m;a->D=d;A->K=0;
/* store in hash,dummy stays*/
a->X=X|8*(m>q)|S*(m<l);a->Y=Y;
/* empty, type (limit/exact)*/
}
/*
encoded in X S,8 bits */
/*if(z==8)printf("%2d ply, %9d searched, %6d by (%2x,%2x)\n",d-1,N,m,X,Y&0x77);*/
}
if(z&8){K=X;L=Y&~M;}
return m;
}
!
main()
{
int j,k=8,*p,c[9];
!
F(i,0,8)
{b[i]=(b[i+V]=o[i+24]+40)+8;b[i+16]=18;b[i+96]=9;
F(j,0,8)b[16*j+i+8]=(i-4)*(i-4)+(j-3.5)*(j-3.5);
}
F(i,M,1035)T[i]=random()>>9;
!
W(1)
{F(i,0,121)printf(" %c",i&8&&(i+=7)?10:n[b[i]&15]);
p=c;W((*p++=getchar())>10);
N=0;
if(*c-10){K=c[0]-16*c[1]+C;L=c[2]-16*c[3]+C;}else
D(k,-I,I,Q,1,1,O,8,0);
F(i,0,U)A[i].K=0;
if(D(k,-I,I,Q,1,1,O,9,2)==I)k^=24;
}
/* initial board setup*/
/* center-pts table
*/
/*(in unused half b[])*/
/* play loop
/* print board
/* read input line
*/
*/
*/
/* parse entered move */
/* or think up one
*/
/* clear hash table
*/
/* check legality & do*/
}
System
Package/Subsystem/Namespace
Constructing
What is the
ideal IDE?
Class Description
Writing
Method/Function Signature
Method/Function Body
Writing
Line/Statement
Code Bubbles
(Courtesy of Steve Reiss
Brown University, USA)
Code Canvas
!
(Courtesy of Rob DeLine
Microsoft Research, USA)
Gaucho by Fernando Olivero
!
Flexitools 2010, ECOOP 2011, VL/HCC 2011
!
The programmer, like the poet, works
only slightly removed from thought-stuff.
He builds his castles in the air, from air,
creating by exertion of the imagination.
!
Frederick P. Brooks, Jr.
!
!
The Mythical Man-Month
!
Addison-Wesley, 1975
ThrustSSC
We acquire more information
through vision than through
all the other senses combined.
!
Colin Ware
Part II
!
!
Information Visualization
Ascension
Perception for Design
!
Morgan Kaufmann, 2004
110,000 BHP
1,227 km/h
UML is software
g
n
o
visualization
wr
(software)
visualization myths
a picture is
worth
g
n
o
a thousand
wr words
visual programming
g is
n
o
software
wrvisualization
g
n
o
wr
static
visualization
g
n
o
wr
software visualization
g
n
o
wr
dynamic
visualization
about
software
William Playfair
Jacques Bertin
1759 - 1823
Semiology of Graphics,1967
software
information
of
software
visualization
visualization
visualization ~ story
Stephen Few
Show Me the Numbers
!
Analytics Press, 2004
Edward Tufte
The Visual Display of Quantitative Information
!
Morgan Kaufmann, 2004
It was the twenty-eighth of November. An
immense confused mass of men, horses,
vehicles besieged the narrow entrances to
the bridges and began to flow over them.
Those in front, pushed by the weight of those behind were
crushed, trampled on, or forced into the ice-filled water of the
Berezina.
The confusion was so great that when Napoleon himself wished
to cross, it was necessary to use force to clear a passage.
Some there were who, determined to pass at all costs, cut a
horrible way for themselves with their swords. Others opened
an even crueler road for their carriages, driving them pitilessly
through the helpless crowd, crushing men and women, in their
odious greed sacrificing their companions in misery.
Metaphors
Software is intangible,
having no physical shape or size.
Thomas Ball, Stephen Eick
!
Software Visualization in the
Large
!
In Computer, vol. 29, no. 4, pp.
33-43, IEEE Computer Society
Press, 1996
Habitability is
the characteristic of source code that
enables programmers [..] to understand
its construction and intentions and to
change it comfortably and confidently.
Software systems as cities
is a versatile metaphor which enables the
creation of efficient software visualizations
to support reverse engineering.
!
Richard Wettel
!
Richard P. Gabriel
!
!
!
!
Software Systems as Cities
Patterns of Software
!
PhD Thesis, University of Lugano, 2010
!
Oxford University Press, 1998
Software Cities
The City Metaphor
Program Comprehension
System
Language
ScummVM C++
ArgoUML
package ~ district
nesting level ~ color
class ~ building
methods (NOM) ~ height
attributes (NOA) ~ width, length
lines (LOC) ~ color
LOC
136'325
NOP
141
NOC kLOC
3'117
305
System
Language
ScummVM C++
CodeCity
Smalltalk
NOP
NOC kLOC
System
141
3'117
305
ScummVM C++
34
173
29
CodeCity
Language
Smalltalk
iTextSharp C#
System
Language
NOC kLOC
System
141
3'117
305
ScummVM C++
34
173
29
CodeCity
iTextSharp C#
22
485
58
iText
Java
36
566
Jmol
Java
50
558
ScummVM C++
CodeCity
System
Smalltalk
Language
NOP
System
3'117
305
ScummVM C++
34
173
29
CodeCity
22
485
58
305
34
173
29
iTextSharp C#
22
485
58
iText
Java
36
566
59
Language
Smalltalk
System
ScummVM C++
34
173
29
CodeCity
iTextSharp C#
22
485
58
59
iText
Java
36
566
85
Jmol
Java
50
jEdit
Java
59
System
305
ScummVM C++
34
173
29
CodeCity
iTextSharp C#
22
485
58
iText
Java
36
566
Jmol
Java
50
jEdit
Java
ArgoUML
GWT
NOP
NOP
NOC kLOC
141
3'117
305
34
173
29
iTextSharp C#
22
485
58
59
iText
Java
36
566
59
558
85
Jmol
Java
50
558
85
966
98
jEdit
Java
59
966
98
ArgoUML
Java
88
1'817
144
Language
Smalltalk
NOC kLOC
System
141
3'117
305
ScummVM C++
34
173
29
CodeCity
iTextSharp C#
22
485
58
59
iText
Java
36
566
558
85
Jmol
Java
50
59
966
98
jEdit
Java
Java
88
1'817
144
ArgoUML
Java
Java
302
4'372
212
GWT
JBoss
Smalltalk
Language
NOC kLOC
3'117
305
NOC kLOC
NOP
141
NOC kLOC
Smalltalk
NOP
Language
3'117
3'117
CodeCity
NOC kLOC
141
141
141
ScummVM C++
NOP
Language
NOP
NOP
NOC kLOC
141
3'117
305
34
173
29
iTextSharp C#
22
485
58
59
iText
Java
36
566
59
558
85
Jmol
Java
50
558
85
59
966
98
jEdit
Java
59
966
98
88
1'817
144
ArgoUML
Java
88
1'817
144
Java
302
4'372
212
GWT
Java
302
4'372
212
Java
1'507
7'881
435
JBoss
Java
1'507
7'881
435
JDK 1.5
Java
Smalltalk
Smalltalk
664 12'888 1'085
iText
System
Language
ScummVM C++
NOP
NOC kLOC
Java
59 kLOC
iText
Eclipse
CodeCity
C#
Java
Smalltalk
58 kLOC
2,871 kLOC
29 kLOC
System
Language
141
3'117
305
34
173
29
CodeCity
iTextSharp C#
22
485
58
iText
Java
36
566
59
Jmol
Java
50
558
85
jEdit
Java
59
966
98
ArgoUML
Java
88
1'817
144
GWT
Java
302
4'372
212
JBoss
Java
1'507
7'881
435
JDK 1.5
Java
664 12'888 1'085
Eclipse
Java
1'800 27'900 2'871
CodeCity
Smalltalk
NOP
ScummVM C++
ArgoUML
Java
jEdit
144 kLOC
NOC kLOC
141
3'117
305
34
173
29
iTextSharp C#
22
485
58
iText
Java
36
566
59
Jmol
Java
50
558
85
jEdit
Java
59
966
98
ArgoUML
Java
88
1'817
144
GWT
Java
302
4'372
212
JBoss
Java
1'507
7'881
435
JDK 1.5
Java
664 12'888 1'085
Eclipse
Java
1'800 27'900 2'871
Smalltalk
Java
98 kLOC
Jmol
Java
JDK
85 kLOC
Java
ScummVM
1,085 kLOC
C++
305 kLOC
JBoss
Java
435 kLOC
GWT
Java
212 kLOC
OpenSwing Java
Software Evolution Analysis
Ver. 0.12
18/08/2003
Ver. 0.10.1
9/10/2002
CPPParser
JavaRecognizer
Ver. 0.16
19/07/2004
Facade
FacadeMDRImpl
Facade
112
ModelFacade
JavaTokenType
NSUMLModelFacade
1'754
Coarse-grained Age Map
Ver. 0.14
5/12/2003
ModelFacade
268
FacadeMDRImpl
Facade
NSUMLModelFacade
Ver. 0.18.1
30/04/2005
language.java.generator
AGE:
8
NOM:
0
NOA:
146
reveng.classfile
AGE:
3
NOM:
200
NOA:
85
AGE: 8
NOM: 91
NOA: 24
STDCTokenType
reveng.classfile
AGE:
3
NOM:
0
NOA:
152
Ver. 0.20
9/02/2006
JavaRecognizer
AGE:
8
NOM: 176
NOA:
79
Ver. 0.23.4
10/12/2006
Ver. 0.22
8/08/2006
Ver. 0.24
12/02/2007
Fine-grained Age Map
JavaTokenType
1
reveng.java
AGE:
8
NOM:
0
NOA:
175
2
3
4
5
6
Coarse-grained Time Travel
age: 1
2 3 4 5 6 7 8
updated often,
rather unstable
stable
highly unstable
ArgoUML
years
major releases
6
8
rarely updated
1
2
3
4
5
6
7
8
very old
old
young
very old
0.10.1
2002
0.12 0.14
2003
0.16
2004
0.18.1
2005
0.20
0.22
2006
0.24
2007
7
8
ModelFacade
model
LOC: 1'280
NOM:
183
NOA:
60
0.10.1
2002
0.12 0.14
2003
0.16
2004
0.18.1
0.20
2005
0.22
0.24
2006
0.10.1
2007
2002
0.12 0.14
2003
0.16
2004
0.18.1
0.20
2005
0.22
0.24
2006
0.10.1
2007
2002
0.12 0.14
2003
0.16
0.18.1
2004
2005
NSUMLModelFacade
model
LOC: 3'275
NOM:
426
NOA:
108
0.10.1
2002
0.12 0.14
2003
0.16
2004
0.18.1
0.20
2005
0.22
0.24
2006
0.10.1
2007
2002
0.12 0.14
2003
0.16
2004
Facade
FacadeMDRImpl
0.18.1
0.20
2005
0.22
Facade
model
LOC:
NOM:
NOA:
2002
0.12 0.14
2003
0.16
2004
0.18.1
2005
0.20
0.22
2006
0.24
0.24
2006
2007
2002
0.12 0.14
2003
0.16
2004
0.18.1
2005
0.20
Facade
0.22
2006
2006
2007
Facade
model
LOC: 2'709
NOM:
329
NOA:
2
0.10.1
2007
FacadeMDRImpl
0.10.1
0.24
FacadeMDRImpl
0
306
1
2002
0.12 0.14
2003
0.16
0.18.1
2004
0.20
2005
0.22
0.24
2006
2007
Fine-grained Time Travel
JHotDraw
years
revisions
0.10.1
0.22
NSUMLModelFacade
model
LOC: 2'102
NOM:
316
NOA:
2
ModelFacade
0.20
5
8
0.24
2007
2000
2001
2002
2003
2004
3
1
2000
2001
2002
2003
2004
2000
2001
2002
2003
2
2
1
1
2004
2000
2001
2002
2003
2004
6
2000
2001
2002
2003
5
5
4
4
4
3
3
3
2
2
2
1
1
1
2004
2000
2001
2002
2003
2004
2000
8
2000
2001
2002
2003
2004
7
7
6
6
5
5
4
4
3
3
2
2
1
1
2000
2001
2002
2003
2004
2001
2002
2003
2004
Problem Detection & Analysis
Metrics? Don’t trust them
!
Michele Lanza & Radu Marinescu
!
!
Object-Oriented Metrics in Practice
Using Software Metrics to
Characterize, Evaluate, and Improve
the Design of Object-Oriented Systems
!
ArgoUML classes
Springer, 2006
brain class
8
god class
30
god + brain
6
data class
17
unaffected
1'715
JDK 1.5 god classes
Facade
NOM 140/337
AggregationKind
NOM 3/3
VisibilityKind
NOM 4/4
PseudostateKind
NOM 6/6
Model
NOM 28/44
Shotgun surgery
Jmol Feature Envy
useful?
CodeCity
codecity.inf.usi.ch
Interlude
Chains of Academia
“I am not convinced I would
learn anything from the
visualizations”
In the Name of Science
SE
C
D
I
TE
C
JE
RE
Design
Execution
Knock me down, I’ll just come back running
Results
Design
State of the Art Survey
Experiment Design Desiderata
Experiment Design Desiderata
1 Avoid comparing using a technique against not using it.
1 Avoid comparing using a technique against not using it.
2 Involve participants from the industry.
2 Involve participants from the industry.
3 Provide a not-so-short tutorial of the experimental tool to the participants.
3 Provide a not-so-short tutorial of the experimental tool to the participants.
4 Avoid, whenever possible, giving the tutorial right before the experiment.
4 Avoid, whenever possible, giving the tutorial right before the experiment.
5 Use the tutorial to cover both the research behind the approach and the tool.
5 Use the tutorial to cover both the research behind the approach and the tool.
6 Find a set of relevant tasks.
6 Find a set of relevant tasks.
7 Choose real object systems that are relevant for the tasks.
7 Choose real object systems that are relevant for the tasks.
8 Include more than one object system in the design.
8 Include more than one object system in the design.
9 Provide the same data to all participants.
Finding a Baseline
9 Provide the same data to all participants.
10 Limit the amount of time allowed for solving each task.
10 Limit the amount of time allowed for solving each task.
11 Provide all the details needed to make the experiment replicable.
11 Provide all the details needed to make the experiment replicable.
12 Report results on individual tasks.
12 Report results on individual tasks.
13 Include tasks whose expected result is not to the advantage of the tool being evaluated.
13 Include tasks whose expected result is not to the advantage of the tool being evaluated.
14 Take into account the possible wide range of experience level of the participants.
14 Take into account the possible wide range of experience level of the participants.
1. program comprehension
2. design quality assessment
3. system evolution analysis
Research Questions
1
Research Questions
1
2
Does the use of CodeCity increase the correctness
of the solutions to program comprehension tasks,
compared to non-visual exploration tools, regardless
of the object system size?
Tasks
Tasks
Does the use of CodeCity increase the correctness
of the solutions to program comprehension tasks,
compared to non-visual exploration tools, regardless
of the object system size?
Does the use of CodeCity reduce the time needed to
solve program comprehension tasks, compared to
non-visual exploration tools, regardless of the object
system size?
A1
Identify the convention used in the system to organize unit tests.
A2.1&
A2.2
What is the spread of term T in the name of the classes, their attributes and
methods?
A3
Evaluate the change impact of class C, in terms of intensity and dispersion.
A4.1
Find the three classes with the highest number of methods.
A4.2
Find the three classes with the highest average number of lines of code per
method.
B1.1
Identify the package with the highest percentage of god classes.
B1.2
Identify the god class with the largest number of methods.
B2.1
Identify the dominant (affecting the highest number of classes) class-level design
problem.
B2.2
Write an overview of the class-level design problems in the system.
Tasks
Program Comprehension
6
Quantitative
A1
A2.1&
A2.2
methods?
A2.1&
A2.2
methods?
A3
A3
A4.1
A4.1
A4.2
method.
A4.2
method.
B1.1
B1.1
B1.2
B1.2
B2.1
problem.
B2.1
problem.
B2.2
B2.2
Design Quality Assessment
4
Experiment Runs
Qualitative
Testing the Waters
Execution
9
A1
1
Experiment Timeline
2009
November
day 1
day 2
day 3
day 4
18 24 25
2
9
3
1
1
1
November
18 24 25
1
1
1
Lugano
1
2
9
3
1
1
1
21
January
28
5
8
14
February
28
... April
18
22 24 25
1
1
3
14
1
1
1
remote
sessions
Bologna
experiment session
(2 hours)
December
time
Lugano
training session
(1 hour)
2009 2010
December
e1
e2
c1
c2
e3
Antwerp
c4
2
1
1
1
1
6
1
5
6
remote
session
Bern
4
6
1
A.5 Data
On Replicability
45
Background
Age
40
Code
30
a
Introduction
Country of origin
18
12
T1
Participant:
T1
experiment is
to compare
practitioners
Participant:
Introduction analyzing medium to large-scaletool efficiency in supporting
software
software systems.
You will use
The aim of thisCodeCity to analyze Azureus,
experiment is to
a BitTorrent client
compare tool
written in Java.
efficiency in supporting
medium to large-scale
given maximum
software
Introduction
100 minutes for software systems.
solving 10 tasks
YouYou
(10 minutes per
willare
useasked:
CodeCity analyze
task).
Azureus,
The aim of this
experiment to
is to compare
BitTorrentinclient
tool aefficiency
supporting
software
• not to consult
written in
Java.
any other
practitioners
analyzing
medium
You• are
to large-scale
participantsoftware systems.
given maximum
Introduction
to perform
the experiment;
minutes for during
the tasks100
in the
specifiedsolving
• to write down
order; 10 tasks (10 minutes per task).
the
You
will
use
CodeCity
current
toisanalyze
Azureus,
areexperiment
a efficiency
BitTorrent
time
client
asked:
written insoftware
The aim ofYou
thisafter
to compare
tool
in
supporting
Java.
each
time before
completing
all
• not
tasks; software systems.starting to read a task
consult
practitioners
medium
to the
large-scale
•analyzing
totoannounce
any
other
and once
participant during
the
You are
experimente
maximum
100
minutes
• togiven
forr that
the
solving
perform
tasks
experiment;
(10 minutes per task).
the tasks
to
reset your
you10
in the specified
are moving
10-minutesorder;
on
• to
writetodown
another task,
per-task
You will use
CodeCity
to analyze
Azureus,
a BitTorrent
client written into
Java.
• not
the current
allocated
in order
time each time
timer;
You areafter
asked: return to earlier tasks
because it before
• forcompleting
to read a task and
each task,all
affectsstarting
the timing;
to the
fill
once
• not
any100
intasks;
other
participant
the
• totoconsult
during10
announce
thetasks
experiment;
You are
given
maximum
minutes
forrequired
solving
(10 minutes
per task).
check
the
information.
experimenter
the
most
that
the case of multiple
you are movingInon
• to perform
tasks
in appropriate
the specifiedanswer
to requested.
resetthe
order;
your
10-minutes-per
to another task, inchoices
and provide additional
-task
order
allocated
• toasked:
theto
current time each
not down
•write
time
before timer;
to return
starting to read a information,
You are
task and once
earlier tasks because
if
after
completing
all the
affects
tasks; during the itexperiment;
• for
anytask,
other
Theeach
• not to consult
toparticipant
experiment
in the required information. the timing;
isfillconcluded
to announce
the
check
youaare
moving In
onthe
with
to another
the the
tasks
inexperimenter
the
specifiedthat
order;
task, in order
most
• to •perform
case of multiple
short
appropriate
debriefing
answer
choices
to reset
questionnaire
andtimer;
your
provide to
allocated
additional
down
the10-minutes-per-task
current time each time
before
starting
read
a task
and.once
• to write requested.
information,
if
to returnalltothe
• not
earlier
tasks because it affects the timing;
after
completing
tasks;
forThe
each
task,
to fill in the required
experiment
case of multiple
the
experimenter
that youinformation.
are moving In
onthe
to another
task, in order
Thank
• to •announce
choices
is
concluded
you
with and
participating
short
check
the10-minutes-per-task
mostfor
appropriate
debriefing
answer
in
provide
additional
thisa experiment!
information, if
to reset
your
allocated
timer;
questionnaire.
requested.
to
because it affects the timing;
• not to return
earlier
tasks
Richard
the case of multiple choices
Wettel, In
• for each task, to fill in the required information.
Michele
Lanza, Romain
TheThank
experiment
concluded
withand
a short
debriefing
you appropriate
questionnaire.
forisparticipating
check
the
most
answer
provide
additional
information, ifRobbes
in this experiment!
requested.
Richard Wettel, Michele
Lanza, Romain Robbes
The experiment is concluded with a short debriefing
questionnaire.
Thank you for participating in this experiment!
Participant:
practitioners
You are analyzing
Richard Wettel, Michele Lanza, Romain Robbes
Thank you for participating in this experiment!
Richard Wettel, Michele Lanza, Romain Robbes
174
A.4.1 T1: Azureus,
analyzed
A.4 Task Solution
with CodeCity
Oracles
A.2.
The assig
A.5 Data
A1
A.4 Task Solution
Oracles
A.4.1 T1: Azureus,
analyzed with CodeCity
Either
174
A.4 Task Solution Oracles
A1
There are no
unit tests in
the system [1pt],
or
Either
A.4.1 T1:
Azureus,
Centralized
A.4 Task Solution Oracles
analyzed with CodeCity
There are no unit in a single
package hierarchy
tests in the system
[1pt]. Since
whose root is
there is only one [1pt],
A1
or
in org.gudy.azure
test class (i.e.,
answer, the answer
us2.ui.console.
TestUserManage
Centralized
T1: Azureus,
analyzed
with CodeCity
is
multiuser
r), if they don’t
in a single
wrong.
Either
packagecompletely
give the full correct
hierarchy whose
[1pt]. Since there
root is in org.gudy.azureus2.
is only
There are no unitA2.1
one test
tests in the
system
[1pt],
class (i.e., TestUserManager),
ui.console.multiuse
answer, the
answer is completely
r
if they don’t give
or
wrong.
the full correct
Either Centralized
Dispersed
in a single
[0pts
package
hierarchy
whose root is in org.gudy.azureus2.ui.console.multiuser
otherwise]
There are
no unit
tests in
in the following
system [1pt],
A2.1
[1pt].
Since
there isthe
only one test (max.
class (i.e.,
TestUserManager), if they don’t give the full correct
5) packages
or
[0.2pts for each]:
answer, the
answer is completely wrong.
Dispersed
[0pts
• com.aelitis.azur
otherwise]
Centralized in a single package
hierarchy
whose root is in org.gudy.azureus2.ui.console.multiuser
eus.core
in the following
[1pt]. Since there is
only one test(max.
class (i.e.,
TestUserManager), if they don’t give the full correct
5)
packages
[0.2pts for
A2.1
answer, the answer•is completely wrong.
eus.core.conten each]:
com.aelitis.azureus.
t
core
Dispersed [0pts otherwise]
eus.core.downlo
• com.aelitis.azureus.
A2.1 in the following
(max. 5) packages
[0.2pts for each]: ad
core.content
eus.core.impl
Dispersed•[0pts
otherwise]
com.aelitis.azureus.core
core.download
com.aelitis.azur
in the following (max. 5) •packages
[0.2pts eus.core.lws
for each]:
• com.aelitis.azureus.core.content
core.impl
• com.aelitis.azureus.core
eus.core.peerma
• com.aelitis.azureus.core.download
nager.peerdb
core.lws
• com.aelitis.azureus.core.content
eus.core.stats
• com.aelitis.azureus.core.impl
core.peermanager.p
eerdb
• com.aelitis.azureus.core.download
eus.core.torrent
• com.aelitis.azureus.core.lws
core.stats
• com.aelitis.azureus.core.impl
eus.plugins.net.
buddy
• com.aelitis.azureus.core.peermanager.peerdb
core.torrent
• com.aelitis.azureus.core.lws
eus.plugins.net.
• com.aelitis.azureus.core.stats
plugins.net.buddy buddy.swt
• com.aelitis.azureus.core.peermanager.peerdb
eus.plugins.net.
buddy.tracker
• com.aelitis.azureus.core.torrent
plugins.net.buddy.sw
t
• com.aelitis.azureus.core.stats
• com.aelitis.azureus. eus.plugins.removerules
• com.aelitis.azureus.plugins.net.buddy
plugins.net.buddy.tr
acker
• com.aelitis.azureus.core.torrent
• com.aelitis.azureus. eus.plugins.sharing.hoster
• com.aelitis.azureus.plugins.net.buddy.swt
plugins.removerules
• com.aelitis.azureus.plugins.net.buddy
• com.aelitis.azureus. eus.plugins.startstoprules.defau
• com.aelitis.azureus.plugins.net.buddy.tracker
plugins.sharing.host
ltplugin
er
• com.aelitis.azureus.plugins.net.buddy.swt
• com.aelitis.azureus. eus.plugins.tracker.dht
• com.aelitis.azureus.plugins.removerules
plugins.startstoprule
s.defaultplugin
• com.aelitis.azureus.plugins.net.buddy.tracker
• com.aelitis.azureus. eus.plugins.tracker.local
• com.aelitis.azureus.plugins.sharing.hoster
plugins.tracker.dht
• com.aelitis.azureus.plugins.removerules
• com.aelitis.azureus. eus.plugins.tracker.peerauth
• com.aelitis.azureus.plugins.startstoprules.defaultplugin
plugins.tracker.local
• com.aelitis.azureus.plugins.sharing.hoster
• com.aelitis.azureus. eus.ui.swt.content.columns
• com.aelitis.azureus.plugins.tracker.dht
plugins.tracker.peer
auth
• com.aelitis.azureus.plugins.startstoprules.defaultplugin
• com.aelitis.azureus. eus.ui.swt.shells.main
• com.aelitis.azureus.plugins.tracker.local
ui.swt.content.colum
ns
• com.aelitis.azureus.plugins.tracker.dht
• com.aelitis.azureus. eus.util
• com.aelitis.azureus.plugins.tracker.peerauth
ui.swt.shells.main
• com.aelitis.azureus.plugins.tracker.local
•
com.aelitis.azureus.
• com.aelitis.azureus.ui.swt.content.columns
util
• com.aelitis.azureus.plugins.tracker.peerauth
• com.aelitis.azureus.ui.swt.shells.main
• com.aelitis.azureus.ui.swt.content.columns
• com.aelitis.azureus.util
• com.aelitis.azureus.ui.swt.shells.main
174
A.4.1
A1
• com.aelitis.azureus.util
194
ta
CodeCity Experiment
CodeCityTheExperiment
aim of this
Explore
Italy
Belgium
Switzerland
Argentina
Canada
Germany
Romania
T1
Da
industryadvanced
ent
5
academyadvanced
174
A.
academybeginner
EXAMINE VARIABLES=Age BY Syssize BY Tool
/PLOT=BOXPLOT
/STATISTICS=NONE
/NOTOTAL.
7
T1
Compl. Time
Total
(excl. A4.2)
Per Task
B2.1
B1.2
A.5 48.55
Data
Completion Time
B1.1
A4.2
52.97
A4.1
1.02
A3
1.90
34.67
9.85
A2.2
44.67
4.42
A2.1
Code
2.33
A1
7.28
2.92
10.00
37.97
5.25
5.38
47.33
10.00
4.32
1.83
5.33
8.80
2.95
8.58
45.50
IA01
2.55
1.75
55.50
9.37
6.08
1.08
5.62
2.42
1.33
9.43
32.83
IA02
10.00
2.42
42.83
10.00
6.92
2.58
7.92
6.25
2.25
6.08
43.25
IA03
6.92
4.00
53.25
10.00
6.00
1.67
6.25
9.08
2.58
4.75
27.08
IA04
9.42
3.92
37.08
10.00
4.83
2.17
8.08
1.33
1.58
4.25
36.00
AB01
3.75
1.67
46.00
10.00
5.58
1.42
3.50
10.00
1.33
4.75
45.92
AB02
5.17
1.33
55.67
10.00
3.67
1.33
3.17
6.33
3.25
8.75
44.67
IA05
8.33
2.08
54.67
9.75
4.42
1.50
5.00
9.67
1.33
10.00
AA01
36.25
7.33
Correctness
Per Task
Correctness
2.50
10.00
46.25
7.17
Total2.33
8.33
10.00 A4.24.17B1.1
AA02
24.67
A1
A2.1
A2.2
A3
B2.1 1.42
(excl.
A4.2)
4.42 A4.1
34.67
10.00B1.2 2.42
5.92
1.17
4.58
10.00
IA06 1.00
39.50
6.25 1.006.33 0.00
0.00
1.00
0.0010.001.00 3.33
1.00 4.006.00 1.42
6.00
47.33
5.67 1.00
3.75
7.25
2.50
37.58
IA07 0.80
3.67
2.95 1.004.55 0.00
1.00
0.50
1.00
6.30 1.25
6.30
46.08
2.25 1.00
5.250.00 7.831.00
2.67
1.50
10.00
37.83
IA08
7.17
0.00
0.00
1.00
0.00 5.000.00 8.50
1.00
0.00
3.00 8.08 3.00
45.58
3.67 0.003.00 1.00
10.00
1.75
5.00
35.32
IA09 1.00
42.23
0.00
0.00 3.830.00 7.75
1.00 3.08
1.00
5.00 1.75 5.00
2.00 1.009.33 1.00
6.33 0.00
2.83
6.83
30.33
AB03
40.33
0.00
1.003.25 1.00
0.00 4.00
0.00 6.92
1.00 7.42
1.00
5.80 1.25 5.80
3.67 0.807.33 1.00
2.00
7.17
24.75
IA10
34.50
0.00
1.003.83 1.002.40 0.705.921.00
0.00 5.17
0.00 10.00
0.00 5.75
1.00 0.50
4.70 3.83 4.70
5.58
30.50
IA11
2.00
36.83
0.00
1.003.75 1.002.67 0.20 4.171.00 3.67
0.00 5.58
0.00 9.75
1.00
1.00 1.25
5.20 2.58 5.20
39.00
IA12
4.08
49.00
0.00
0.802.671.003.920.40 2.581.0010.00
0.00 3.17
0.00 6.33
1.00
0.00 1.83
4.20 1.67 4.20
35.83
AB04
9.58
3.75
43.67
0.00
1.00 2.501.003.170.00
1.00 5.58
0.00 5.83
1.00 10.00
1.00
1.00 1.25
6.00 2.83 6.00
26.58
AA03
6.00
4.33
36.58
4.670.30
0.00
1.00 6.33
0.00 3.50
0.00 7.83
1.00
1.00 1.67
4.30 2.08 4.30
AB051.00 5.500.00
43.92
4.83
5.17
49.75
1.00AA041.00 7.081.00 3.670.17
1.00 4.75
0.00 3.25
0.00 10.00
1.00
1.00 1.83
6.17 4.17 6.17
46.27
5.33
2.33
56.27
1.00IA13 0.50 3.00
1.00 4.67
0.83
1.00 10.00
0.00 4.50
1.00 5.83
1.00 10.00
1.00
7.33 2.00 7.33
3.85
55.92
2.42
65.92
10.00
9.00
2.67
10.00
6.67
1.00IA141.00
1.00
1.00 2.05
1.00 6.95
0.00
1.00
1.00 10.00
1.00
8.00
8.00
3.67
50.33
54.75
10.00
3.17
1.00AB061.00 5.67
1.00 5.75
1.00 4.67
1.00 10.00
0.00 6.83
1.00
1.00 10.00
1.00
8.00 6.67 8.00
30.83
37.58
4.42
9.33
1.58 6.17
1.58
1.00AB07
1.00 8.75
1.00
0.17 3.75
1.00 8.42
0.00 5.25
0.00
1.00
1.00
6.17
22.83
5.33
6.75
6.08
1.835.0028.00
3.25
3.33
1.00 AA05
1.00 7.00
1.00
0.00 3.67
1.00 6.58
0.16
0.00
0.00
1.00
5.16
52.67
2.75
62.67
5.17
2.00
7.83
2.92
1.75
50.25
1.00 AA06
1.00 6.83
1.00
1.00 2.50
1.00 2.17
0.00
1.00
1.00
1.00
8.0056.75
5.08 8.00
10.00
3.67
4.50
4.33
2.00
39.02
1.00 AA07
1.00 3.67
1.00
0.67 5.08
1.00 10.00
0.00
0.00
1.00
6.6743.32
8.50 6.67
6.50 1.00
8.83
4.00
2.93
2.10
9.50
46.55
1.00 IA15
1.00 9.75
0.80 7.42
0.33 4.33
1.00
0.16
0.00
1.00
6.13 50.53
9.92 6.29
4.30 1.00
3.12
4.03
10.00
4.13
9.20
33.08
1.00 IA16
1.00
1.00 3.90
1.00
1.00
0.00
0.00
1.00
7.00 38.17
9.77 7.00
4.38
3.98 1.00
5.58
4.72
2.55
1.92
9.82
44.42
6.58 6.50
1.00 IA01
1.00
1.00 5.13
0.50
1.00
0.00
0.00
1.00
6.50 50.33
4.58
5.08 1.00
1.92
3.83
5.28
3.58
3.50
33.83
9.42 6.83
1.00 IA18
1.00
1.00 3.83
0.83
1.00
1.00
0.00
0.00
5.83 39.83
4.50
5.92 1.00
1.50
3.50
3.33
1.92
8.83 0.00
38.00
2.00
10.00
0.00 IA19
0.80
0.33
0.30
1.00
0.67
1.00
1.00
5.10
4.43
41.33
6.00
1.08
2.33
3.42
6.08
1.17
10.00
31.92
4.83 4.00
4.33
1.00 AB08
1.00
1.00
0.00
0.00
0.00
0.00
4.00 37.08
3.33 0.00
4.83
2.92
8.00 1.00
5.83
1.67
7.33
34.25
3.67 2.27
5.08 0.00
5.17 1.00
0.00 AB09
0.60
0.67
0.00
0.00
0.00
2.27 37.58
6.08
3.00
3.50 0.00
3.17
1.42
3.83
53.08
6.08 1.00
3.33 1.004.42 7.00
1.00 AA10
1.00
1.00
0.00
1.00
0.00
1.00
6.00 58.75
4.08
5.42
5.25
6.17
1.42
4.75
40.08
AA11
3.92 1.00
5.67 1.006.33 4.00
4.75 1.00
0.00
1.00
0.00
0.00
3.00 43.33
4.50
5.92 0.00
6.75 0.00
2.42
10.00
32.67
AA12
9.17
10.00 1.00
7.00 1.00
1.00
1.00
0.00
0.00
5.001.42
4.00 36.08
1.33
2.83 1.003.25 0.00
7.00 0.00
10.00
AA13
2.42
3.33 1.00
1.50 1.00
1.00
0.80
0.00
4.80
3.80
2.33 1.003.42 0.00
6.33 0.00
8.00 0.00
AA14
4.75
5.58 1.00
0.00
0.00
0.67
0.67
0.00
1.00
1.00
4.34
3.67
6.83 0.00
IA20
time, in minutes
1.00
1.00
1.00
0.50
1.00
1.00 subjects’
0.00
1.00
1.00
7.50
6.50
task completion
The
A.4. 1.00
1.00
1.00
1.00
0.50Table
1.00
0.00
1.00
1.00
7.50
6.50
0.00
1.00
0.60
0.00
1.00
1.00
0.00
1.00
0.00
4.60
3.60
0.00
1.00
0.40
0.38
1.00
1.00
1.00
1.00
1.00
6.78
5.78
1.00
1.00
0.60
0.50
1.00
1.00
0.00
1.00
1.00
7.10
6.10
1.00
1.00
0.80
0.38
1.00
1.00
0.00
1.00
1.00
7.18
6.18
1.00
1.00
0.80
0.00
1.00
1.00
0.00
1.00
1.00
6.80
5.80
0.00
0.50
0.00
0.00
1.00
1.00
0.00
1.00
1.00
4.50
3.50
0.00
1.00
0.00
0.25
1.00
1.00
0.00
1.00
1.00
5.25
4.25
1.00
1.00
0.60
0.00
1.00
1.00
0.90
1.00
1.00
7.50
6.50
0.00
1.00
1.00
0.38
1.00
1.00
0.00
1.00
1.00
6.38
5.38
196
ent
CodeCity Experim
Participant:
Table
195
CodeCity Experim
20
Block
developer
software engineer
system analyst
software architect
project manager
IT lead
CTO
consultant
professor
Ph.D. student
Master student
Dat
20
A.5
41 Subjects
21
25
industry
academia
IA01
A1
IA02
diffic
IA03
ult
IA04
diffic
AB01
A2.1
simp ult
AB02
inter
trivia le
IA05
medi
trivia l
AA01
ate
inter
A2.2
simp l
AA02
simp medi
inter
inter le
IA06
ate
trivia le
medi
trivia medi
IA07
trivia l
ate
inter
ate
inter l
IA08
simp l
simp medi
diffic medi
IA09
trivia le
A3
simp
35
ate
trivia le
Cod
trivia ult ate
AB03
le
trivia l
trivia l
Diffi
diffic l
IA10
e
diffic
IA0
simp l
A4.1 culty
simp l
simp ult
IA11
simp ult
IA0 1
inter le
simp
Age
simp le
Leve
simp le
IA12
IA0 2
simp le
trivia medi
le
trivia le
l Per
IA0 3
simp le
AB04
30
trivia le
ate
simp l
inter
simp l
A4.2 Task
AB0 4
34
simp le
AA03
Job
simp l
simp le
inter medi
AB0 1
inter le
42
impo
trivia le
AB05
inter le
Pos
simp le
Dev
simp medi ate
IA0 2
37
simp medi
ssible
trivia l
AA04
simp medi
Dev elop itio
simp le
ate
simp le
AA0 5
21
ate
simp le
diffic
simp l
IA13
ate
simp le
B1.1
CTO elop
simp le
AA0 1
n
inter le
21
simp le
er
impo ult
inter le
IA14
simp le
Dev ,
diffic
IA0 2
trivia le
29
trivia medi
simp le
inter ssible
trivia medi
Ma elop Dev er
AB06
inter le
IA0 6
ult
26
trivia l
ate
simp l
simp le
diffic medi
ate
Ma ster
elop
inter l
AB07
IA0 7
inter medi
26
simp
simp l
simp le
B1.2
simp le
Con ster Stuer
impo ult ate
IA0 8
simp medi
AA05
35
simp medi ate
diffic le
inter le
simp le
trivia
Ph. sult Stu den er
trivia le
impo ssible
AB0 9
ate
inter le
AA06
27
ate
simp le
diffic ult
simp medi
inter le
Ph. D.
l
trivia l
IA1 3
impo ssible
25
simp medi
AA07
diffic le
impo ult
ate
inter le
Hea D. Stu ant den t
inter medi
simp
IA1 0
simp l
inter ssible
32
B2.1
ate
inter le
IA15
inter ult
diffic ssible
simp medi
Sof d Stu den , Ph. t
simp medi ate
IA1 1
simp le
diffic le
28
diffic medi
trivia medi
trivia
IA16
simp medi
impo ult
ate
Sof twa of den t D.
simp le
ate
AB0 2
diffic le
simp le
39
simp ult
impo ult ate
ate
diffic l
l
ate
diffic le
Stu IA01
diffic ssible
Dev twa re IT
AA0 4
simp le
t
inter ult
38
simp le
inter le
diffic ssible
OO
inter ult
simp
denIA18
inter ult
Stu elop re Eng
inter ult
AB0 3
B2.2
simp le
34
inter medi
diffic le
simp medi
kno P le
inter ult
trivia le
Pro den
inter medi
IA19
diffic medi
Eng ine
t
AA0 5
simp medi
22
inter
inter
inter medi ate
trivia ult
ate
simp le
impo medi
Con ject t me
wle medi
simp l
inter medi ate advinter
IA1 4
AB08
simp ult ate
ate
22
diffic le
medi
nt ine er
trivia medi ate
simp l
Time
simp le
Sen sult Ma
anc dge
diffic ssible ate
trivia le
IA1 3
simp medi ate exp inter
AB09
22
inter le
inter ult
Lea er
ate
impo
ate
inter l
simp le
ert edmediablate
Ma ior
simp le
diffic ult
AB0 4
Pres
trivia l
29
ateadvadv inter
inter le
AA10
inter medi
inter medi
der
medi ate
diffic ssible
inter medi
Ma ster Javant nag
anc
simp le
inter le
AB0 6
fair
sure
diffic ult
32
e
simp l
trivia medi adv ancsimp
AA11
impo medi ate
, Res
inter medi ate
inter ult
inter medi ate
Jav medi
Ma ster Stu a , Sys er
ed medi ate
inter le
AA0 7
inter
fair amou
31
impo ult
trivia le
ate
simp l
inter ssible ate
exp ancsimp
diffic medi ate
simp medi
ate
Ph. ster Stu den Arc tem
adv medi
a
ed le
ear AA12
inter medi ate
AA0 5
simp medi
ate
23
inter
fair amou nt
impo ssible
Exp
simp l
diffic le
AA13
exp ert inter
diffic medi
trivia ult ate
Con D.
ate
hite
adv
simp le
le
ed
ate
che
AA0 6
medi
anc
inter
ate
23
simp le
Stu
medi
ate
Ma
den t
too amou nt
inter
impo ssible
simp ult exp ert simp medi
inter le
AA14r
erietrivia ult ate
kno
Sof sult Stu den
trivia l
trivia le
IA1 7
anc
ct nag
30
trivia medi ate
inter le
mediedate
very much nt
t
diffic ssible
simp le exp ert inter le
simp medi
IA20
advinter
Ma twa ant den
diffic
diffic l
IA1 5
nce l
wle ed ate
inter l
ate
26
ate
inter l
simp medi
er/
fair little
inter ult
ate
advinter
Ma ster re
trivia le kno ert simp medi
trivia le
impo ult
anc medi
t t
IA0 6
diffic ult
30
dgeate
diffic medi
trivia medi
Lev
ate
Ana
simp le
not amou
simp
Ph. ster Stu Arc
diffic medi
inter l kno wle simp le
simp l
anc medi
IA1 1
ssible
ed abl
diffic Ecl
40
inter ult
ate begadvsimp
inter ult ate
ate
trivia l
el
trivia le
adv wle dge
lyst
ate
le
very so muchnt
Ph. D.
diffic ult ate
anc ed
medi
IA1 7
inter le
39
Stu den hite
ult ips
inter medi
e diffickno
le
inter medi
simp l
kno ate
trivia l
exp inter
Ph. D. Stu
anc dge inter
le
not little
inn ed
AB0 9
inter ult
simp medi
abl medi
30
ct
simpknoult wle e
inter medi ate
diffic medi ate
exp wle
simp le
adv ert
Pro D. Stu den den t
inter l
ed trivia
fair so much
AB0 8
interermedi
inter medi
abl e
27
ate
simp le
interbegle wle dge trivia le
medi ate
t
impo
kno ert dge simp
ateexp anc
inter ult ate
l
Stu
Sof
den
medi
ject
inter
AA1 9
kno
t
medi
ate
simp
e
inn dge abl
39
not amou
simp medi ate
simp le
medi
impo
exp wle
simp ssible ate
le
diffic medi
kno ertintered le
Dev twa Ma den t
simp l
AA1 0
adv wle
inter medi ate
abl
21
ate
very so muchnt
ate
simp le
trivia le
e le
ate abl trivia
adv ert dge
inter anc
ssible er
diffic le
adv wle
Sof elop re nag t
diffic ult ate
AA1 1
adv
e
Rev
23
inter medi ate
medi
not little
simp le
e
trivia l
adv anc
interancmedi dge
beg anc trivia
Con twa
Arc
simp ult
AA1 2
beg
diffic ult
abl
24
dge l
ed abl trivia l beg .En
inter medi ate
ate
simp
very so much
adv anc ed
simp le
diffic l
kno inn ed
Ma sult re er
simpinnmedi
hite er
AA1 3
kno
inter le
e
diffic ult
23
ed ate e simp l kno inn g. simp le
abl
inter medi ate
le
adv anc ed
not little
inter ult
adv wle er
Ph. ster ant Eng
IA2 4
knodiffic
inter medi
52
wle le er ate
ct
simp ult
e
trivia lekno wle er simp le
inter medi ate
exp anc ed
fair so much
beg anc dge
Ph. D.
adv impo
inter medi
0
28
ine
wle ult
inter medi ate
Stu
dge
simp le
wle dge
simp beg
l
adv ert ed
simp medi ate
exp inn ed abl
Ph. D. Stu den, Pro er
none amou
trivia le
beg inter
inter medi ate
24
anc ssible
beg
dge abl
inn dge abl
impo medi ate
le
impo
Tab
le
simp
adv
ate
le
trivia
Stu
adv
Pro
anc
D.
ject
den
nt
very
ert er
simp l
36
inter medi ate
medi abl e simp beg
e le kno inn ed
inn er
OO
diffic ssible ate
diffic ssible
adv anc ed
abl e le
adv anc
Ph. fess Stu den t t
inter l
non
not little
kno leinn er
trivia
Ma
inter medi ate
ate e
A.5wle er
7–1 P simp ult
adv anc ed
e
inter ult
adv anc ed
Ma D. or den t
trivia
simp medi
beg e
not so much
beg lewle er
nag
simp l
dge
inter medi ate
7–1 0 diffic le
exp anc ed
adv anc ed
Dev ster Stu
Nu
inter medi
trivia
kno inn . The
l
ate
t
trivia le
kno inn
fair so much
er,
inter le >1 0
dge
simp medi ate
abl
adv ert ed
kno anc ed
interJavult mb
diffic medi ate
beg wle er
elop Stu den
l er
inter l
none amou
Arc
7–1 a
esub adv wle
ate
exp anc
impo le
abl simp medi7–1 0
kno wle ed
er simp medi
simp medi
adv inn dge
inter ult ate
kno anc dge
er den t
nt
hite
fair
ject
adv ert ed
etrivia le 4–6ate0
inter ssible
adv wle dge
adv anc er
simp4–6le 0 ateEcl oftrivia le
diffic medi
beg wle ed abl trivia
ate
t
abl
fair amou
kno anc
Yea
ct
l 1–3
1–3le
adv anc dge abl
simp medi
adv anc ed
simp
adv inns’ dge
ips
inter ult ate
7–1
7–1
per
e
amou
inter
l
kno wle ed
e
nt
too
adv anc ed abl e
rs
l
ate
kno anc ed
simp le
beg anc er
le
simp
e medi
impo medi
4–6
>1 0
Ta
exp wle dge
simp
ablceiv
4–6 0
kno anc ed
fair much nt
kno wle ed
inter le
kno inn ed
e
diffic le
diffic ssible ate
1–3inter le
adv ert dge abl
ble
e ed 4–6 0
ate
1–3
adv wle ed
kno wle dge
very amou
adv wle er
simp medi
7–1trivia ult
1–3simp mediRev
>1tim
adv anc
diffic ult
exp anc dge
kno wle dge
none little nt
.En
abl e
adv anc dge
ate
simp le
A.
7–1 0 e
4–6
>1 0 l
adv anc ed
diffic ult
adv ert ed abl
ate g. inter le
adv wle dge ablable
e
kno anc ed abl
fair
pre
1.
4–6 simp le
adv anc ed
1–3
1–3 0
adv anc
4–6 0
impo ult
adv anc dge
beg wle ed
fair amou
e
7–1
1–3 trivia le
>1 ssu
inter medi
exp anc ed
e
4–6
Th
adv anc ed
adv anc ed ablable e
inter ssible
beg inn dge
7–1 0
not amou nt
re 4–6 trivia l >1
adv ert ed
inter medi ate
exp anc ed
4–6 0
e
adv anc ed
diffic medi
beg inn er
and
l
1–3
e
fair so muchnt
exp anc
>1
abl
1–3 0 simp medi ate
4–6 0
adv ert ed
su
kno anc ed
exp inn er
impo ult ate
7–1
7–1 0
1–3 task
adv ert ed
e
fair amou
adv anc
ate
bje
<1
diffic le
adv wle ed
beg ert er
impo ssible
4–6
>1 0
anc
1–3 0
not amou nt
adv anc ed
<1
kno anc dge
trivia ult
beg inn
diffi
cts
7–1
diffic ssible
1–3
4–6
adv anc ed
4–6 0
ed
fair so muchnt
kno wle ed abl
l
beg inn er
7–1 0
7–1
1–3
inter ult
’ pe
adv anc ed
<1 cul
very amou
kno wle dge
kno inn er
e
ty
>1 0
impo medi
1–3
<1
adv anc ed
4–6 0
kno wle dge
not little nt
exp wle er
7–1
rso
1–3
1–3 0
diffic ssible ate
exp anc ed
4–6
adv wle dge ablable
7–1
kno ert dge
not so much
7–1
4–6 0
na
4–6
exp ert ed
ult
adv anc dge
adv wle
not so much
7–1 0
1–3 0
abl
1–3
l inf
<1
ert
adv anc ed ablable e
kno anc dge
>1
not so much
e
4–6
4–6
4–6 0
adv anc ed
beg wle ed abl
1–3 0
e
not so much
or
1–3
4–6
4–6
kno anc ed
beg inn dge
7–1
7–1
so
4–6
ma
e
4–6
kno wle ed
much
adv inn er
7–1
7–1 0
1–3
abl
1–3 0
exp wle dge
tio
beg anc er
7–1 0
1–3
>1 0
e
1–3
adv ert dge abl
kno inn ed
n,
7–1
1–3
>1 0
<1 0
anc
adv wle er
7–1 0
abl e
1–3
clu
4–6 0
<1
kno anc dge
ed
7–1 0
7–1
e
1–3
1–3
ste
adv wle ed abl
1–3
>1 0
4–6 0
7–1
kno anc dge
7–1 <1
red
>1
e
4–6 0
kno wle ed abl
7–1
4–6 0
1–3 0
1–3 0
by
beg wle dge
1–3
e
4–6 0
4–6
<1
7–1
inn dge abl
7–1
tre
4–6
<1
er
4–6 0
abl e
4–6
atm
1–3 0
4–6
4–6
>1
e
4–6
1–3
en
4–6
1–3
4–6
4–6 0
7–1
t co
4–6
4–6
4–6
1–3 0
>1
>1
1–3
mb
1–3
0
1–3
4–6 0
4–6
4–6
ina
<1
7–1
4–6
4–6
tio
4–6
0
<1
4–6
ns
1–3
4–6
1–3
7–1
1–3
0
>1
1–3 0
1–3
1–3
50
Code
IA01
IA02
IA03
IA04
AB01
AB02
IA05
AA01
AA02
IA06
IA07
IA08
IA09
AB03
IA10
IA11
IA12
AB04
AA03
AB05
AA04
IA13
IA14
AB06
AB07
AA05
AA06
AA07
IA15
IA16
IA01
IA18
IA19
AB08
AB09
AA10
AA11
AA12
AA13
AA14
IA20
2
55
ia
ing Criter ience
Block
Exper
ced
round
advan
Backg
ced
try
advan
ment
m size
indus
ced
Treat
Syste
try
advan
indus
ced
large
Tool
try
advan
er
ity
indus
ner
large
Numb
CodeC
try
begin
Code
1
ity
indus
ner
large
CodeC
my
begin
1
ity
acade
ced
IA01
large
CodeC
my
advan
1
ity
acade
ced
IA02
large
CodeC
try
advan
1
ity
indus
ced
IA03
large
CodeC
my
advan
1
ity
acade
ced
IA04
large
CodeC
my
advan
1
ity
acade
ced
AB01
large
CodeC
try
advan
1
ity
indus
ced
AB02
large
CodeC
try
advan
1
ity
indus
ced
IA05
large
CodeC
try
advan
1
ity
m
indus
AA01
ner
mediu
CodeC
try
begin
1
m
ity
indus
ced
AA02
mediu
CodeC
my
advan
2
m
ity
acade
ced
IA06
mediu
CodeC
try
advan
2
m
ity
indus
ced
IA07
mediu
CodeC
try
advan
2
m
ity
indus
ner
IA08
mediu
CodeC
try
begin
2
m
ity
indus
ced
IA09
mediu
CodeC
my
advan
2
m
ity
acade
ner
AB03
mediu
CodeC
my
begin
2
m
ity
acade
ced
IA10
mediu
CodeC
my
advan
2
m
ity
acade
ced
IA11
mediu
CodeC
my
advan
2
m
ity
acade
ced
IA12
mediu
CodeC
try
advan
2
m
ity
indus
AB04
ner
mediu
CodeC
try
begin
2
m
ity
indus
AA03
ner
mediu
CodeC
my
begin
2
ity
acade
ced
AB05
large
CodeC
my
advan
2
xcl
acade
ced
AA04
large
Ecl+E
my
advan
3
xcl
acade
ced
IA13
large
Ecl+E
my
advan
3
xcl
acade
ced
IA14
large
Ecl+E
my
advan
3
xcl
acade
ced
AB06
large
Ecl+E
try
advan
3
xcl
indus
ced
AB07
large
Ecl+E
try
advan
3
xcl
indus
ced
AA05
large
Ecl+E
try
advan
3
xcl
indus
ced
AA06
large
Ecl+E
try
advan
3
m
xcl
indus
AA07
ner
mediu
Ecl+E
try
begin
3
m
xcl
indus
IA15
ner
mediu
Ecl+E
my
begin
4
m
xcl
acade
ced
IA16
mediu
Ecl+E
my
advan
4
m
xcl
acade
ced
IA01
mediu
Ecl+E
my
advan
4
m
xcl
acade
ced
IA18
mediu
Ecl+E
my
advan
4
m
xcl
acade
ced
IA19
mediu
Ecl+E
my
advan
4
m
xcl
acade
ced
AB08
mediu
Ecl+E
my
advan
4
m
xcl
acade
ced
AB09
mediu
Ecl+E
my
advan
4
m
xcl
acade
AA10
mediu
Ecl+E
try
4
m
xcl
indus
AA11
blocks
mediu
Ecl+E
4
m
xcl
ts and
AA12
mediu
Ecl+E
4
xcl
treatmen
AA13
Ecl+E
cts to
4
AA14
the subje
of
IA20
nment
19
Subjects
193
Table A.3. The correctness of the subjects’ solutions to the tasks
Pre-experiment
questionnaire
Experiment
handouts
Task solutions &
grading schemes
Data set
Page 22
Correctness
Results
24 %
.26
more correct with CodeCity
Completion Time
CodeCity
CodeCity
Eclipse+Excel
Eclipse+Excel
12 %
8
7
.01
6
60
50
faster with CodeCity
5
40
4
30
ANOVA (two-way analysis of variance)
statistically significant
95% confidence interval
large effect size (d=0.89)
CodeCity
vs
Eclipse
!
+24% correctness
-12% completion time
3
2
medium
large
Statistically
Significant!!
ANOVA (two-way analysis of variance)
statistically significant
95% confidence interval
moderate effect size (d=0.63)
20
medium
large
Statistically
Significant..
Statistically
Significant
(who cares?)
Richard Wettel, Michele Lanza,
Romain Robbes
Software Systems as Cities: A Controlled Experiment
Richard Wettel and Michele Lanza
ABSTRACT
Software Systems as Cities:
A Controlled Experiment
Software visualization is a popular program comprehension technique used in the context of software maintenance, reverse engineering, and software evolution analysis. While there is a broad range
of software visualization approaches, only few have been empirically evaluated. This is detrimental to the acceptance of software
visualization in both the academic and the industrial world.
We present a controlled experiment for the empirical evaluation
of a 3D software visualization approach based on a city metaphor
and implemented in a tool called CodeCity. The goal is to provide
experimental evidence of the viability of our approach in the context
of program comprehension by having subjects perform tasks related
to program comprehension. We designed our experiment based on
lessons extracted from the current body of research. We conducted
the experiment in four locations across three countries, involving
41 participants from both academia and industry. The experiment
shows that CodeCity leads to a statistically significant increase in
terms of task correctness and decrease in task completion time. We
detail the experiment we performed, discuss its results and reflect
on the many lessons learned.
!
!
Categories and Subject Descriptors
D.2.7 [Distribution, Maintenance and Enhancement]: Restructuring, reverse engineering, and reengineering
In Proceedings of ICSE 2011
(33rd International Conference on Software Engineering)
pp. 551 - 560. IEEE CS Press, 2011
General Terms
Experimentation, Human Factors, Measurement
Keywords
1.
INTRODUCTION
Software visualization is defined by Stasko et al. as “The use
of the crafts of typography, graphic design, animation, and cinematography with modern human-computer interaction and computer graphics technology to facilitate both the human understanding and effective use of computer software” [13].
Richard Wettel and Michele Lanza
Categories and Subject Descriptors
D.2.7 [Distribution, Maintenance and Enhancement]: Restructuring, reverse engineering, and reengineering
General Terms
Experimentation, Human Factors, Measurement
Keywords
1.
1. we detail the experiment design and its operation, reporting
on a number of lessons learned regarding the many pitfalls
that this type of experiment entails, and
INTRODUCTION
Permission to make digital or hard copies of all or part of this work for
personal or classroom use is granted without fee provided that copies are
not made or distributed for profit or commercial advantage and that copies
bear this notice and the full citation on the first page. To copy otherwise, to
republish, to post on servers or to redistribute to lists, requires prior specific
permission and/or a fee.
ICSE ’11, May 21–28 2011, Honolulu, Hawaii
Copyright 2011 ACM 978-1-4503-0445-0/11/05 ...$10.00.
3.
PLEIAD @ DCC
University of Chile
rrobbes@dcc.uchile.cl
It has earned a reputation as an effective program comprehension
technique, and is widely used in the context of software maintenance,
reverse engineering and reengineering [2].
The last decade has witnessed a proliferation of software visualization approaches, aimed at supporting a broad range of software
engineering activities. As a consequence, there is a growing need for
objective assessments of visualization approaches to demonstrate
their effectiveness. Unfortunately, only few software visualization
approaches have been empirically validated so far, which is detrimental to the development of the field [2].
One of the reasons behind the shortage of empirical evaluation lies
in the difficulty of performing a controlled experiment for software
visualization. On the one hand, the variety of software visualization
approaches makes it almost impossible to reuse the design of an
experiment from the current body of research. On the other hand, organizing and conducting a proper controlled experiment is, in itself,
a difficult endeavor, which can fail in many ways: data which does
not support a true hypothesis, conclusions which are not statistically
significant due to insufficient data points, etc.
In this paper we present a controlled experiment for the evaluation of a software visualization approach based on a city metaphor [16].
The aim is to show that our approach, implemented in a tool called
CodeCity, is at least as effective and efficient as the state of the
practice at supporting reverse engineering and program comprehension activities. We conceived a set of tasks and measured both the
correctness of the task solutions and the task completion time. We
conducted the experiment in four locations across three countries,
with participants from both industry and academia, with a broad
range of experience.
In this paper we make the following two major contributions:
Software visualization, Empirical validation
Software visualization is defined by Stasko et al. as “The use
of the crafts of typography, graphic design, animation, and cinematography with modern human-computer interaction and computer graphics technology to facilitate both the human understanding and effective use of computer software” [13].
CODECITY IN A NUTSHELL
Our approach [16, 15, 19] uses a city metaphor to depict software
systems as three-dimensional cities (see Figure 1), where classes
are buildings and packages are districts.
Romain Robbes
REVEAL @ Faculty of Informatics
University of Lugano
{richard.wettel,michele.lanza}@usi.ch
2. we discuss the results of the experiment, which show that
our approach is a viable alternative to existing non-visual
techniques.
Structure of the paper. In Section 2 we describe our software
visualization approach. In Section 3 we present the related work,
followed by a list of desiderata in Section 4, extracted from an
extensive study of the existing body of research. In Section 5 we
describe the design of our experiment and in Section 6 we detail the
experiment’s operational phase. In Section 7 we present how we
collected the data on which we performed the analysis presented
in Section 8. In Section 9 we present the results of the experiment,
followed by a presentation of the threats to validity in Section 10,
and the conclusions in Section 11.
Figure 1: Representation of a software system in CodeCity
Similar to the 2D polymetric view approach [4], we map a set of
software metrics on the visual properties of artifacts: The Number
Of Methods is mapped on the height of the buildings, the Number Of
Attributes on the base size, and the number of Lines Of Code on the
color of the buildings, from dark gray (low) to intense blue (high).
We also extended our approach to address the visualization of
design problems [19]: Inspired by disease maps we assign vivid
colors to design problems and color the affected artifacts according
to the design problems that characterize them. The resulting visualization is called disharmony map (see Figure 2) and relies on design
problem data computed using Marinescu’s detection strategies [7].
Component
java.awt
String
Calendar
NOM: 280
NOA: 88
java.lang
NOM: 81
NOA: 7
RELATED WORK
4.
There is a rich body of research on empirical evaluation of information visualization by means of controlled experiments. To
identify both good practices and commonly occurring mistakes, we
conducted an extensive study of the literature. The study [20] extends far beyond the scope of this paper; we limit the discussion to
controlled experiments for the evaluation of software visualization.
Storey and Müller performed a controlled experiment [14] to
compare the support of SHriMP and Rigi in solving a number of
program comprehension tasks, both to each other and to a baseline
(i.e., SNIFF+).
Marcus et al. performed a study to test the support provided
by the sv3D visualization tool in answering a set of program comprehension questions [6]. The authors compared the performances
obtained by using sv3D to the ones obtained by exploring the source
code in an IDE and a text file containing metrics values.
Lange et al. performed a controlled experiment in which they
evaluated the usefulness of their enriched UML views, by comparing them with traditional UML diagrams [3]. The baseline of the
experiment was composed of a UML tool and a metric analysis tool.
Quante performed a controlled experiment for the evaluation of
dynamic object process graphs in supporting program understanding
[10], using not one, but two subject systems. A statistically significant improvement of the experimental group could only be detected
in the case of one of the systems, which shows that relying solely
on one system is unsound.
Cornelissen et al. [1] performed a controlled experiment for the
evaluation of EXTRAVIS, an execution trace visualization tool. The
purpose of the experiment was to evaluate how the availability of
EXTRAVIS influences the correctness and the time spent by the
participants in solving a number of program comprehension tasks.
From the perspective of the experiment’s design, there are two
major differences between our experiment and the related work,
summarized in Table 1.
Subjects
Academia
Industry
Storey et al. [14]
30
Marcus et al. [6]
24
Lange et al. [3]
100
Quante [10]
InputEvent
java.awt.event
NOM: 14
NOA: 21
0
0
0
25
0
http://codecity.inf.usi.ch
17
27
38
39
475
1,725
2
8. Choose real-world systems. Many experiments in the literature use small systems, making it hard to generalize the
results for real-world systems.
42
?
?
20
1
310
57
21
20
1,320
4,656
93
454
Q4 : Do the potential benefits of using CodeCity in terms of correctness and time depend on the user’s background (i.e., academic versus industry practitioner)?
7. Limit the time allowed for solving each task. Allowing
unbounded time for a task to avoid time pressure may lead
participants to spend the entire time allotted for the experiment on solving a single task, as Quante’s experiment [10]
showed.
43
160
Cornelissen et al. [1]
9. Include more than one subject system in the experimental design. Quante [10] showed that performing the same
experiment on two different systems can lead to significantly
different results.
10. Provide the same data to all participants. The observed
effect of the experiment is more likely attributable to the
independent variables if this guideline is followed.
Table 1: Comparing to the related work
5.2.1
Q3 : Which are the task types for which using CodeCity over nonvisual exploration tools makes a difference in either correctness or completion time?
6. Include tasks which may not advantage the tool being
evaluated. This allows the experimenters to actually learn
something during the experiment, including shortcomings of
their approach.
11. Report results on individual tasks. This allows for a more
precise and in-depth analysis of the strengths and weaknesses
of an approach.
The first is that in our experiment, we have a nearly equal split between
subjects from academia (i.e., 21) and industry (i.e., 20), while the
rest of the experiments have only subjects from academia, with
the exception of Cornelissen et al., who had one single industry
practitioner. While students are convenient subjects, a sample made
up entirely of students might not adequately represent the intended
user population [9].
The second issue regards the fact that the systems used by most
of the experiments are rather small and, thus, not representative of
a realistic work setting. In our experiment the larger of the two
systems has almost half a million lines of code. Moreover, learning
from Quante’s experiment we do not rely on one system only, but
on two.
Research Questions & Hypotheses
The research questions underlying our experiment are:
Q5 : Do the potential benefits of using CodeCity in terms of correctness and time depend on the user’s experience level (i.e.,
novice versus advanced)?
The null hypotheses and alternative hypotheses corresponding to
the first two research questions are synthesized in Table 2.
12. Provide all the details needed to make the experiment
replicable. Di Penta et al. discuss the benefits of replicability
and present a set of guidelines that enable the replication of
the experiment [9].
5.
EXPERIMENTAL DESIGN
The purpose of our experiment is to quantitatively evaluate the
effectiveness and efficiency of our approach when compared to
state-of-the-practice exploration approaches.
Null hypothesis
Alternative hypothesis
H10 : The tool does not impact
the correctness of the solutions to program comprehension tasks.
H1 : The tool impacts the correctness of the solutions to program comprehension tasks.
H20 : The tool does not impact the
time required to complete program comprehension tasks.
H2 : The tool impacts the time required to complete program
comprehension tasks.
Table 2: Null and alternative hypotheses
For the third question, we perform a separate analysis of correctness and completion time for each of the tasks. For the last two
questions we analyze the data within blocks, which we however
do not discuss in this paper due to space reasons. We refer the
interested reader to our detailed technical report [20].
5.2
5.2.2
7.4
Participants’ Feedback
The questionnaire handout ends with a debriefing section, in
which the participants are asked to assess the level of difficulty for
each task and the overall time pressure, to give us feedback which
could potentially help us improve the experiment, and optionally, to
share with us any interesting insights they encountered during the
analysis.
System size
Medium
Large
Name
Lines of code
Packages
Classes
God classes
Brain classes
Data classes
A4.1
1
11
5
4
0
A4.2
2
7
11
1
0
B1.1
B1.2
0
0
3
8
10
2
2
6
9
2
B2.1
4
11
5
1
0
B2.2
8
9
3
1
0
1
4
6
8
2
Dep. var.
System size
Tool
Medium
Ecl+Exl
CodeCity
mean
difference
min
max
median
stdev
3.50
6.50
5.800
1.147
5.462
Difficulty
A1
A2.1
A2.2
tool = Ecl+Excl 7
trivial
7
5
simple
7
10
12
intermediate
3
4
3
difficult
4
0
1
impossible
0 A2.1
0 A2.2
0
Difficulty
A1
trivial
7
7
5
simple
7
10
12
intermediate
3
4
3
tool = CodeCity
difficult
4
0
1
impossible
0
0
0
Ecl&Exl
trivial
3
0
A3
simple
12
9
6
3
8
2
0
8
0
0
A1
A2.1
tool = Ecl+Excl
trivial
Legend:
12
9 tool = Ecl+Excl
1
11
5
4
0
A3
simple
A4.1
A321
A4.2
9
A4.1
7
7
5
4
A3
3
0
0
A4.1
B1.2
0
B2.2
8
9
1
4
6
8
2
1
4
6
2
B2.2
B2.161
8
0
0
B1.2
difficult
1
0 B2.2
impossible8
B2.1
B2.2
impossible
B2.1
B1.265
5
4
2
B1.1
B2.1
4
11
5
1
0 B2.1
8
9
3
1
0
3
impossible
4
11
5
1
difficult
B1.2
1
B1.1
7
4
2
0
A4.2
9
2 B1.2
B1.2
difficult
B1.1
4
A4.2
9
Task
Task
intermediate
simple
2
2
6
difficult
intermediate
A4.1
A4.2
B1.1
simple
intermediate
Task
A3
10
0
0
A2.2
A4.1
A4.2
B1.1
1
2
0
11
7
0
5
11
3
4
1
8
0 A4.1
0 A4.2
10 B1.1
2
0
2
7
0
2
11
3
6
1
8
9
0
10
2
intermediate
A3
trivial
0 12
A2.2
= CodeCity A2.1
9tool A1
trivial
CodeCity
6
12 3
9 0 A1
A2.1
A2.2
1
1
A163
A2.1
A2.2
6
10
8
Difficulty
trivial
simple
intermediate
difficult
impossible
B2.200
10
1
1
B2.1
6
8
5
B2.2
impossible
6
simple
difficult
impossible
Figure 4: Histograms of perceived difficulty per task
trivial
intermediate
12 3
9 0
A1
A2.1
A3
A4.1
A4.2
B1.1
B1.2
B2.1
B2.2
6
The reason is that we underestimated the knowledge of CodeCity
required to perform this task. Solving this task with our tool implies
A2.2
3
Task
0
B2.2
A1
A2.1
A2.2
A3
A4.1
A4.2
B1.1
B1.2
B2.1
Task
a deep knowledge of CodeCity’s customization
features, and of the
underlying Smalltalk programming language: the only subject who
managed to solve the task in the experimental group is a Smalltalk
expert. These were unreasonable requirements to expect from the
experimental subjects, who were not trained to use such features. To
eliminate this unusually large discrepancy between the two groups,
we excluded the task from the analysis.
Difficulty
trivial
simple
intermediate
difficult
impossible
Difficulty
trivial
simple
intermediate
difficult
impossible
8.2
A1
A1
3
6
8
2
0
A2.1
A2.2
3
1
1
6
10
8
8
8
10
2
0
0
0 A2.1
0 A2.2
0
1
1
10
8
8
10
0
0
0
0
A3
A3
1
2
7
5
4
A4.1
1
2
7
5
4 A4.1
9
7
3
0
0
A4.2
9
7
3
0
0 A4.2
4
9
4
2
0
B1.1
4
9
4
2
0 B1.1
1
7
5
4
2
B1.2
1
7
5
4
2 B1.2
5
6
8
0
0
B2.1
B2.2
5
1
6
6
8
10
0
1
0 B2.1
1 B2.2
1
0
6
0
10
6
1
8
1
5
0
0
6
8
5
3.896
2.27
6.00
3.900
1.085
Overall
Ecl+Exl
CodeCity
5.050
+29.62%
3.00
6.30
5.100
1.031
4.803
2.27
6.50
4.430
1.349
5.968
+24.26%
3.00
8.00
6.065
1.294
Medium
Ecl+Exl
CodeCity
38.809
31.92
53.08
38.000
6.789
33.178
-14.51%
24.67
39.50
35.575
5.545
8.3
Completion Time (minutes)
Large
Overall
Ecl+Exl
CodeCity
Ecl+Exl
CodeCity
44.128
22.83
55.92
48.260
11.483
39.644
-10.16%
27.08
48.55
40.610
6.963
41.048
22.83
55.92
40.080
9.174
Subject Analysis
9.1
After the pilot study involving nine participants, we conducted
the experiment with a total of 45 participants in several runs. After
removing four data points during the outlier analysis, we were left
with 41 subjects, of whom 20 were industry practitioners (all advanced),
and 21 were from academia (of whom 9 were beginners and 12 advanced). For
each of the 4 treatments, we have 8–12 data points and a fair subject
distribution within the remaining three blocks, as shown in Table 6.
36.117
-12.01%
24.67
48.55
36.125
6.910
Table 7: Descriptive statistics related to correctness and completion time
T1
50.00
60.00
44.13
Outlier Analysis
Before performing our statistical test, we followed the suggestion of Wohlin et al. [21] regarding the removal of outliers caused
by exceptional conditions, to allow us to draw valid conclusions
from the data. During the Bologna I experimental run, one of the
participants assigned with an experimental treatment experienced
serious performance slowdowns due to the low performance of his
computer. Although this participant was not the only one reporting
performance slowdowns, he was by far the slowest as measured
by the completion time and, since this represented an exceptional
condition, we excluded his data from the analysis. Another participant got assigned to an Ecl+Exl treatment, although he did not
have any experience with Eclipse, but with another IDE. For this
reason, this subject took more time in the first tasks than the others,
because of his lack of experience with Eclipse. Since we did not
want to compromise the analysis by disfavoring any of the groups
(i.e., this data point provided the highest completion time and would
have biased the analysis by disadvantaging the control groups), we
excluded also this data point from the analysis.
During the Bologna II run, two participants had compatibility
problems with the virtualization software installed on their machines.
Eventually they were lent our two replacement machines, but
due to the meeting room’s tight schedule, we were not able to wait
for them to finish the experiment. We decided to exclude these two
data points from our analysis.
Correctness (points)
Large
Ecl+Exl
CodeCity
6.733
+23.27%
5.00
8.00
6.585
.959
The effect of tool on completion time. There was a significant
main effect of the tool on the completion time F (1, 37) = 4.392,
p = .043, indicating that the mean completion time was significantly lower for CodeCity users than for Ecl+Exl users.
Overall, there was a decrease in completion time of 12.01% for
CodeCity users (M = 36.117, SD = 6.910) over Ecl+Exl users
(M = 41.048, SD = 9.174). In the case of the medium size
system, there was a 14.51% decrease in completion time of CodeCity
users (M = 33.178, SD = 5.545) over Ecl+Exl users (M =
38.809, SD = 6.789), while in the case of the large size system,
there is a 10.16% decrease in completion time for CodeCity users
(M = 39.644, SD = 6.963) over Ecl+Exl users (M = 44.128,
SD = 11.483). The data shows that the time decrease for CodeCity
users over Ecl+Exl users is slightly lower in the case of the large
system compared to that obtained for the medium system.
The effect of system size on completion time. While not the
goal of the experiment, a significant effect of system size on the completion time was observed, F (1, 37) = 5.962, p = .020, indicating
that completion time was significantly lower for users analyzing the
medium system than for users analyzing the large system.
The main effect of both tool and object system size on completion
time and the lack of the effect of interaction between tool and object
system size on completion time, are illustrated in Figure 7, as well
as the completion time box plots for the four treatments.
40.00
39.64
30.00
Tool
Ecl+Exl
CodeCity
38.81
33.18
20.00
40.00
30.00
Tool
10.00
50.00
Ecl+Exl
CodeCity
0.00
20.00
Medium
Large
Object system size
Medium
Large
Object system size
EXAMINE VARIABLES=correctness BY Syssize
/PLOT=BOXPLOT
/STATISTICS=NONE
/NOTOTAL
/ID=Tool.
Figure 7: Means and box plots for completion time
9.3
Result Summary
EXAMINE VARIABLES=correctness BY Syssize BY Tool
/PLOT=BOXPLOT
/STATISTICS=NONE
/NOTOTAL.
Correctness. The data allows us to reject the first null hypothesis
Explore
H10 in favor of the alternative hypothesis H1, which states that the
tool impacts the correctness of the solutions to program comprehension tasks. Overall, CodeCity enabled an increase in correctness of
24.26% over Ecl+Exl. This result is statistically significant.
Completion time. We can also reject the second null hypothesis
H20 in favor of the alternative hypothesis H2, which states that the
tool impacts the time required to complete program comprehension
tasks. Overall, CodeCity enabled a completion time reduction of
12.01% over Ecl+Exl. This result is statistically significant.
Page 5
9.4
Task Analysis
A secondary goal of our experiment was to identify the types of
tasks for which CodeCity provides an advantage over the baseline.
To this end, for each task described in Section 5.5 we compared the
performances—in terms of correctness and time—of the two tools
and reasoned about the potential causes behind the differences. See
Figure 8 for a graphical overview supporting our task analysis⁵.
A1 - Identifying the convention used to organize unit tests relatively to the tested classes. While Eclipse performed consistently,
CodeCity outperformed it on the medium system and underperformed it on the large system. The difference in performance is
partially owed to the fact that, in spite of the existence of a number of
classes named *Test, there are no unit tests in the large system.
Only a small number of CodeCity users examined the inheritance
relations, while the majority relied only on name matching. The
time is slightly better for the CodeCity subjects, who benefited from
the visual overview of the system, while Eclipse required scrolling
up and down through the package structure.
A2.1 - Determining the spread of a term among the classes.
CodeCity performed marginally better than Eclipse in both correctness and completion time. In CodeCity, once the term search is
completed, the spread can be visually assessed. In Eclipse, the term
search produces a list of class names. For this task the list showed
classes in many packages belonging to different hierarchies and,
therefore, a dispersed spread may represent a safe guess.
A2.2 - Determining the spread of a term among the classes.
Although the task is similar to the previous, the results in correctness
are quite different: CodeCity significantly outperformed Eclipse by
29–38%. The list of classes and packages in Eclipse, without the
context provided by an overview (i.e., How many other packages
are there in the system?) deceived some of the control subjects
into believing that the spread of the term was dispersed, while
the CodeCity users took advantage of the “big picture” and better
identified the localized spread of this term.
A3 - Estimating impact. CodeCity significantly outperformed
Eclipse in correctness by 40–50%, and was slightly faster than
Eclipse. Finding the caller classes of a given class in Eclipse is not
straightforward. Moreover, the resulting list provides no overview.
A4.1 - Identifying the classes with the highest number of methods. In terms of correctness, CodeCity was on a par with Excel for
the medium system and slightly better for the large system. In terms
of completion time, Excel was slightly faster than CodeCity. While
CodeCity is faster at building an approximate overview of systems,
a spreadsheet is faster at finding precise answers in large data sets.
B1.1 - Identifying the package with the highest percentage of
god classes. In both correctness and completion time, CodeCity
slightly outperformed Excel. The low scores in correctness of both
tools show that neither of them is good enough to solve this problem
alone. Instead, CodeCity’s visual presentation and Excel’s precision
could complement each other well.
Page 9
5
Due to space concerns, we do not discuss task B2.2
tool = CodeCity
15
Block
Experience Level
CodeCity
none
beginner
knowledgeable
advanced
expert
OOP
Experience Level
Eclipse+Excel
none
beginner
knowledgeable
advanced
expert
OOP
Experience Level
CodeCity
none
beginner
knowledgeable
advanced
expert
OOP
Java
0
0
5
9
8
Eclipse
0
3
4
12
3
1
5
10
6
0
Reverse
Engineering
0
10
7
4
1
0
0
8
10
1
Reverse
Engineering
0
7
7
4
1
1
5
10
6
0
Reverse
Engineering
0
10
7
4
1
12
9
6
3
0
Academia
Industry
OOP
Treatment
T2
T3
Beginner
Advanced
Beginner
Advanced
2
2
0
6
3
2
0
7
2
3
0
3
Total
10
12
8
Java
Eclipse
T4
Total
2
9
5
12
0none
0
4beginner 20
Legend:
knowledgeable
11advanced 41
expert
Reverse Engineering
Expertise field
Table 6:
Subject distribution
Moreover, the random assignments of treatment within blocks
led to a balanced distribution of the subjects’ expertise among treatments, as we see in Figure 5.
tool = Ecl+Excl
Ecl&Exl
Java
0
0
2
12
5
Eclipse
0
0
2
13
4
15
12
9
6
3
0
Legend:
none
beginner
knowledgeable
advanced
expert
OOP
Java
tool = CodeCity
Eclipse
0
3
4
12
3
15
12
9
6
3
0
Eclipse
Reverse Engineering
Expertise field
CodeCity
Java
0
0
5
9
8
Legend:
none
beginner
knowledgeable
advanced
expert
OOP
Java
Eclipse
Reverse Engineering
Expertise field
Figure 5: Subjects’ expertise
tool = Ecl+Excl
OOP
Java
0
0
2
12
5
Eclipse
0
0
2
13
4
0
0
8
10
1
9.
Reverse
Engineering
0
7
7
4
1
12
9
6
3
0
RESULTS
7.00
6.00
none
beginner
knowledgeable
advanced
expert
OOP
Java
Eclipse
Reverse Engineering
Expertise field
Based on the design of our experiment, i.e., a between-subjects,
unbalanced (i.e., implying unequal sample sizes) design with two
independent variables (tool and system size), the suitable parametric test for hypothesis testing is a two-way Analysis Of Variance
(ANOVA). We performed the test for correctness and completion
time using the SPSS statistical package. Before the analysis, we
ensured that our data met the test’s assumptions:
1. Independence of observations. This assumption is implicitly
met through the choice of a between-subjects design.
2. Homogeneity of variances of the dependent variables. We
tested our data for homogeneity of both correctness and completion time, using Levene’s test [5] and in both cases the
assumption was met.
3. Normality of the dependent variable across levels of the independent variables. We tested the normality of correctness
and completion time across the two levels of tool and object
system size using the Shapiro-Wilk test for normality [11],
and also this assumption was met in all the cases.
We chose a significance level of .05 (α = .05), which corresponds
to a 95% confidence interval. The statistics related to correctness
and completion time are presented in Table 7.
A1
A2.1
0.10
1.00
0.50
0.55
6.73
Ecl+Exl
CodeCity
7.00
5.05
6.00
3.90
3.00
5.00
4.00
2.00
Tool
Ecl+Exl
CodeCity
1.00
3.00
2.00
0.00
Medium
Medium
Large
Large
Object system size
Object system size
EXAMINE VARIABLES=time BY Syssize BY Tool
/PLOT=BOXPLOT
/STATISTICS=NONE
/NOTOTAL.
Figure 6: Means and box plots for correctness
Explore
Analysis Results on Completion Time
Interaction effect between tool and system size on completion
time. Similarly, it is important that there is no interaction between
the two factors, which could have affected the completion time. The
interaction effect of tool and system size on completion time was
not significant, F (1, 37) = .057, p = .813. The data shows no
evidence that the variation in completion time between CodeCity
and Ecl+Exl depends on the size of the system, which strengthens
any observed effect of the tool factor on the completion time.
Page 4
1.00
1.00
A3
0.75
0.98
0.46
0.62
A4.1
0.54
0.62
0.04
0.26
Page 7
A4.2
1.00
1.00
0.75
1.00
B1.1
0.00
0.11
0.67
1.00
B1.2
0.10
0.33
0.00
0.17
B2.1
0.90
0.83
0.75
1.00
0
1.00 1.00
0.60
1.00
0.83
0
A2.2
A3
A4.2
1.00
0.75
B1.2
B2.1
0.80
0.75
0.67
0.63
0.54
0.10
0.04
A2.2
A3
T2
CodeCity installation with a loaded model
of the system to be analyzed, and the source
code of the object system, accessible from
the visualizations.
Ecl+Exl
T3
T4
Eclipse installation with default development tools, an Eclipse workspace containing a project with the entire source code of
the object system, an Excel installation, and
a sheet containing all the metrics and design
problem data required for solving the tasks
and available to the experimental groups.
0
A4.1
A4.2
0
B1.1
B1.2
B2.1
Treatment Description
5.5
Id
Task
Concern
A1
Description. Locate all the unit test classes of the system and identify the convention (or lack thereof) used by the developers to organize the tests.
Rationale. Test classes are typically defined in packages according to a project-specific convention. Before integrating their work in the system, developers
need to understand how the test classes are organized. Software architects design the high-level structure of the system (which may include the convention
by which test classes are organized), while quality assurance engineers monitor the consistency of applying these rules in the system.
Structural
understanding
A2.1 Description. Look for the term T1 in the names of classes and their attributes and methods, and describe the spread of these classes in the system.
Rationale. Assessing how domain knowledge is encapsulated in source code is important in several scenarios. To understand a system they are not familiar
with, developers often start by locating familiar domain concepts in the source code. Maintainers use concept location on terms extracted from change
requests to identify where changes need to be performed in the system. Software architects want to maintain a consistent mapping between the static
structure and the domain knowledge. Each of these tasks starts with locating a term or set of terms in the system and assessing its dispersion.
A2.2 Description. Look for the term T2 in the names of classes and their attributes and methods, and describe the spread of these classes in the system.
Rationale. Same as for task A2.1. However, the term T2 was chosen such that it had a different type of spread than T1.
A3
http://www.virtualbox.org
A2.1
7.22
4.82
7.29
5.39
0
A2.2
5.49
3.53
6.51
4.34
A
A3
2.95
4.76
3.56
5.54
A4.1
7.66
6.32
7.95
7.75
A4.2
5.63
4.41
5.14
4.30
B1.1
9.35
8.74
7.33
4.50
B1.2
6.86
4.94
7.13
6.23
Concept
location
Metric-based
analysis
A4.2 Description. Find the three classes with the highest average number of lines of code per method in the system.
Rationale. It is difficult to prioritize candidates for refactoring from a list of large classes. In the absence of other criteria, the number and complexity of
methods can be used as a measure of the amount of functionality for solving this problem related to maintenance and quality assurance.
Metric-based
analysis
B1.1 Description. Identify the package with the highest percentage of god classes in the system.
Rationale. God classes are classes that tend to incorporate an overly large amount of intelligence. Their size and complexity often make them a maintainer’s
nightmare. Keeping these potentially problematic classes under control is important. By maintaining the ratio of god classes in packages to a minimum, the
quality assurance engineer keeps this problem manageable. For a project manager, in the context of the software process, packages represent work units
assigned to the developers. Assessing the magnitude of this problem allows him to take informed decisions in assigning resources.
Focused
design
assessment
B1.2 Description. Identify the god class containing the largest number of methods in the system.
Rationale. It is difficult to prioritize candidates for refactoring from a list of god classes. In the absence of other criteria (e.g., the stability of a god class
over its evolution), the number of methods can be used as a measure of the amount of functionality for solving this problem related to maintenance and
quality assurance.
Focused
design
assessment
B2.1 Description. Identify the dominant class-level design problem (the design problem that affects the largest number of classes) in the system.
Rationale. God class is only one of the design problems that can affect a class. A similar design problem is the brain class, which accumulates an excessive
amount of intelligence, usually in the form of brain methods (i.e., methods that tend to centralize the intelligence of their containing class). Finally, data
classes are just “dumb” data holders without complex functionality, but with other classes strongly relying on them. Gaining a “big picture” of the design
problems in the system would benefit maintainers, quality assurance engineers, and project managers.
Holistic
design
assessment
B2.2 Description. Write an overview of the class-level design problems in the system. Describe your most interesting or unexpected observations.
Rationale. The rationale and targeted user roles are the same as for task B2.1. However, while the previous one gives an overview of design problems in
figures, this task provides qualitative details and has the potential to reveal the types of additional insights obtained with visualization over raw data.
Holistic
design
assessment
6.
OPERATION
2009
November
December
18 24 25
2
9
21
2010
February
January
28
5
8
14
28
18
..
April
22 24 25
Pilot
14
Experiment
Lugano
Tasks
10.00
Concept
location
Description. Evaluate the change impact of class C by considering its caller classes (classes invoking any of its methods). The assessment is done in terms
of both intensity (number of potentially affected classes) and dispersion (how these classes are distributed in the package structure).
Rationale. Impact analysis allows one to estimate how a change to a part of the system impacts the rest of the system. Although extensively used in
maintenance activities, impact analysis may also be performed by developers when estimating the effort needed to perform a change. It also gives an idea
of the quality of the system: A part of the system which requires a large effort to change may be a good candidate for refactoring.
A4.1 Description. Find the three classes with the highest number of methods (NOM) in the system.
Rationale. Classes in object-oriented systems ideally encapsulate one single responsibility. Since methods are the class’s unit of functionality, the number
of methods metric is a measure of the amount of functionality of a class. Classes with an exceptionally large number of methods make good candidates for
refactoring (e.g., split class), and therefore are of interest to practitioners involved in either maintenance activities or quality assurance.
Table 5: Tasks
Our approach provides aid in comprehension tasks supporting
adaptive and perfective maintenance. We tried to use the previously defined maintenance task definition frameworks by Pacione et al.
[8] and by Sillito et al. [12] to design the tasks of our evaluation.
However, both frameworks proved ill-suited. Since CodeCity relies
exclusively on static information extracted from the source code, it
was not realistic to map our tasks over Pacione’s model, which is
biased towards dynamic information visualization. Sillito’s well-known set of questions asked by developers, although partially
compatible with our tasks, refers to developers exploring source
code only. Our approach supports software architects, designers,
quality-assurance engineers, and project managers, in addition to
developers. These additional roles assess systems at higher levels of
abstraction not covered by Sillito’s framework.
We decided to use these works as inspirations and defined our own
set of tasks, detailed in Table 5, dealing with various maintenance
concerns and split into program comprehension (A) and design
quality assessment (B).
A1
...happily
ever after
2. we discuss the results of the experiment, which show that
our approach is a viable alternative to existing non-visual
techniques.
Structure of the paper. In Section 2 we describe our software
visualization approach. In Section 3 we present the related work,
followed by a list of desiderata in Section 4, extracted from an
extensive study of the existing body of research. In Section 5 we
describe the design of our experiment and in Section 6 we detail the
experiment’s operational phase. In Section 7 we present how we
collected the data on which we performed the analysis presented
in Section 8. In Section 9 we present the results of the experiment,
followed by a presentation of the threats to validity in Section 10,
and the conclusions in Section 11.
Table 4: Treatments
We provided the treatments as virtual images for VirtualBox⁴,
which was the only piece of software required to be installed by the
participants. Each virtual image contained only the necessary pieces
of software (i.e., either CodeCity or Eclipse+Excel), installed on a
Windows XP SP2 operating system.
2.00
B1.1
0.90
0.75
0.46
Findbugs
T1
4.00
0.17
0.86
A2.1
Tool
8.00
A4.1
tool = CodeCity
1.00
0.78
0.10
A1
Azureus
CodeCity
6.00
0.11
A2.1
0.91
0.33
0.26
A1
0.50
0.40
0.20
Treatments
Completion time (minutes)
1.00
0.62
0.62
0.55
1.00
0.80
5.4
By combining the two levels of each of the two independent
variables we obtain four treatments, illustrated in Table 4.
Time
T1: CodeCity, large
T2: CodeCity, medium
T3: Ecl+Excl, large
T4: Ecl+Excl, medium
0.80
1.00
0.63
0.91
tool = CodeCity
0.98
0.95 0.96
0.80
0.60
0.40
Controlled Variables
We identified two factors that could have an influence on the subjects’ performance, i.e., their background and experience level. The
background represents the working context of a subject, which we
divided into industry and academy. The second factor is experience
level, which represents the domain expertise gained by each of the
participants, divided into beginner and advanced. For academia,
subjects below a PhD were considered beginners, while researchers
(i.e., PhD students, post-docs and professors) were considered advanced. For industry, we considered that participants with up to
three years of experience were beginners, and the rest advanced.
We used a randomized block design, with background and experience level as blocking factors. We assigned each participant—based
on personal information collected before the experiment—to one
of the four categories (i.e., academy-beginner, academy-advanced,
industry-beginner, and industry-advanced). We then randomly assigned one of the four treatments (i.e., combinations of tool and
system size) to the participants in each category.
4
0.20
tool = Ecl+Excl
Tool
8.00
5.46
4.00
A2.2
0.86
0.96
0.78
0.95
tool = Ecl+Excl
Medium
object
system
5.00
time
9.2
Azureus
454’387
520
4’656
111
55
256
http://findbugs.sourceforge.net
http://azureus.sourceforge.net
Analysis Results on Correctness
Interaction effect between tool and system size on correctness. It is important that there is no interaction between the two
factors, which could have affected the correctness. The interaction
effect of tool and system size on correctness was not significant,
F (1, 37) = .034, p = .862. The data shows no evidence that the
variation in correctness between CodeCity and Ecl+Exl depends on
the size of the system, which strengthens any observed effect of the
tool factor on the correctness.
The effect of tool on correctness. There was a significant main
effect of the tool on the correctness of the solutions, F (1, 37) =
14.722, p = .001, indicating that the mean correctness score obtained by CodeCity users was significantly higher than the one for
Ecl+Exl users, regardless of the size of the object system.
Overall, there was an increase in correctness of 24.26% for
CodeCity users (M = 5.968, SD = 1.294) over Ecl+Exl users
(M = 4.803, SD = 1.349). In the case of the medium size
system, there was a 23.27% increase in correctness of CodeCity
users (M = 6.733, SD = .959) over Ecl+Exl users (M = 5.462,
SD = 1.147), while in the case of the large size system, the increase in correctness was 29.62% for CodeCity users (M = 5.050,
SD = 1.031) over Ecl+Exl users (M = 3.896, SD = 1.085).
The data shows that the increase in correctness for CodeCity over
Ecl+Exl was higher for the larger system.
The effect of system size on correctness. Although not the object of the experiment, an expected significant main effect of system
size on the correctness of the solutions was observed, F (1, 37) =
26.453, p < .001, indicating that the correctness score was significantly higher for users performing the analysis on the medium
size system than for users performing the analysis on the large size
system, regardless of the tool they used to solve the tasks.
The main effect of both tool and object system size on correctness
and the lack of the effect of interaction between tool and object
system size on correctness are illustrated in Figure 6, as well as the
correctness box plots for the four treatments.
While some of the subjects assigned to CodeCity have little or
no experience with Eclipse, every subject assigned to Ecl+Exl is at
least knowledgeable
in using this IDE.
15
Legend:
Experience Level
Eclipse+Excel
none
beginner
knowledgeable
advanced
expert
FindBugs
93’310
53
1’320
62
9
67
Table 3: The two object systems
2
3
A3
5
12
3
1
0
Correctness
A2.2
7
10
4
0
0
Correctness
A2.1
7
7
3
4
0
Frequency
A1
tool = CodeCity
Frequency
Preliminary Data Analysis
Frequency
Correctness Data
To convert the collected task solutions into quantitative information, we needed an oracle, which would provide both the set of
correct answers and the grading scheme for each task. Having two
object systems and two data sources (source code for the control
group and a model of the system for the experimental group), led
to four oracle sets. To build them, two of the authors and a third
experimenter solved the tasks with each treatment and designed the
grading scheme for each task. In addition, for the two experimental
treatments, we computed the results using queries on the model of
the object system, to make sure that we were not missing any detail
because of particularities of the visual presentation (e.g., occlusion,
too small buildings, etc.). Eventually, we discussed the divergences
and merged our solutions.
Finally, we needed to grade the solution of the subjects. We
employed blind marking to rule out bias; when grading a solution
the experimenter does not know whether the subject that provided
the solution has used an experimental or a control treatment. For
this, one of the authors created four code names for the groups and
created a mapping between groups and code names, known only
to him. Then he provided the other two experimenters with the
encoded data, along with the encoded corresponding oracle, which
allowed them to perform blind grading. In addition, the author who
encoded the data performed his grading unblinded. Eventually, the
authors discussed the differences and converged towards the final
grading. The minimum grade was 0, while the maximum was 8.
DATA ANALYSIS
Difficulty
trivial
simple
intermediate
difficult
impossible
Frequency
7.3
8.
8.1
We observed an exceptional condition related to task A4.2, which
had the highest discrepancy in time and correctness between control
and experimental groups. The data points showed that the experimental group was not able to solve this task, while the control group
was quite successful at solving it. The experimental group had an
average of 0.06, with 19 null scores (out of 22), and most subjects
used up the entire allotted time (i.e., ceiling effect). The control
group had an average of 0.86 with 15 perfect scores (out of 19) and
most subjects finished in roughly half the allotted time. The subjects
perceived its difficulty accordingly: In the debriefing questionnaire,
the experimental subjects graded task A4.2 as “impossible”, while the
control group described it as “simple”, as shown in Figure 4.
trivial
simple
intermediate
difficult
impossible
12
9
6
3
0
A1
A2.1
A2.2
A3
A4.1
A4.2
B1.1
B1.2
B2.1
B2.2
Task
Completion time
Timing Data
Completion time
Personal Information
Frequency
DATA COLLECTION
Frequency
Frequency
Frequency
7.2
Frequency
Frequency
7.1
Object Systems
We chose two Java systems, both large enough to potentially
benefit from visualization, yet of different size, so that we can
reason about this independent variable. The smaller of the two
systems is FindBugs², a tool using static analysis to find bugs in
Java code, developed as an academic project at the University of
Maryland, while the larger system is Azureus³, a popular P2P file
sharing client and one of the most active open-source projects hosted
at SourceForge. In Table 3, we present the main characteristics of
the two systems related to the tasks of the experiment.
Dependent and Independent Variables
The purpose of the experiment is to show whether CodeCity’s
3D visualizations provide better support to software practitioners in
solving program comprehension tasks than state-of-the-practice exploration tools. Additionally, we are interested in how well CodeCity
performs compared to the baseline when analyzing systems of different magnitudes. Consequently, our experiment has two independent
variables: the tool used to solve the tasks and the object system
size. The tool variable has two levels, i.e., CodeCity and a baseline,
chosen based on the criteria described in Section 5.2.1. The object
system size has two levels, i.e., medium and large, because visualization starts to become useful only when the analyzed system has a
reasonable size. The object systems chosen to represent these two
treatments are presented in Section 5.2.2.
Similarly to other empirical evaluations of software visualization
approaches [3, 10, 1], the dependent variables of our experiment
are correctness of the task solution (i.e., a measure of effectiveness)
and completion time (i.e., a measure of efficiency). The design of our
experiment is a between-subjects design, i.e., a subject is part of
either the control group or of the experimental group.
Correctness
T1: CodeCity, large
T2: CodeCity, medium
T3: Ecl+Excl, large
T4: Ecl+Excl, medium
7.
We collected data continuously—before, during, and after the
experiment.
Before the experiment, we collected both personal information
(e.g., gender, age, nationality) and professional data (e.g., job position, experience with object-oriented programming, Java, Eclipse,
and reverse engineering) by means of an online questionnaire, to be
found in [20].
To time participants accurately, we developed our own timing
web application in Smalltalk. During the experimental sessions, the
timing application would run on the experimenter’s computer and
project the current time.
The displayed time was used as a common reference by the participants whenever they were required in the questionnaire to log the
time. In addition, the application displayed, for each participant, the
name, current task, and the remaining time allotted for the task.
The subjects were asked to announce to the experimenter every
time they logged the time, so that the experimenter could reset
their personal timer by clicking on the hyperlink marked with the
name of the subject. Whenever a subject was unable to finish a
task in the allotted time (10 minutes for each task), the application
would display the message “overtime” beside the name, and the
experimenter would ask the subject to immediately pass to the next
task, before resetting the timer.
5.3
Finding a Baseline
There is a subtle interdependency between the baseline and the
set of tasks for the experiment. In an ideal world, we would have
devised tasks for each of the three contexts in which we applied
our approach: software understanding, evolution analysis, and design quality assessment. Instead, we had to settle for a reasonable
compromise. We looked for two characteristics in an appropriate
baseline: data & feature compatibility with CodeCity and recognition from the community (i.e., a state-of-the-practice tool).
Unfortunately we could not find a single tool satisfying both
criteria. To allow a fair comparison, without having to limit the
task range, we opted to build a baseline from several tools. The
baseline needed to provide exploration and querying functionality,
support for presenting at least the most important software metrics,
support for design problems exploration, and if possible support for
evolutionary analysis.
In spite of the many existing software analysis and visualization
approaches, software understanding is still mainly performed at the
source code level. Since the most common source code exploration
tools are integrated development environments (IDEs), we chose
Eclipse, a popular IDE in both academia and industry, which represents the current state-of-the-practice. The next step was finding
support for exploring meta-data, such as software metrics and design
problem data, since they were not available in Eclipse. We looked
for a convenient Eclipse plugin for metrics or even an external metrics tool, but could not find one that fit our requirements (including
support for user defined metrics). Since we did not want to confer
an unfair data advantage to the subjects in the experimental group,
we chose to provide the control group with tables containing the
metrics and design problem data, and the popular Excel spreadsheet
application for exploring the data.
Finally, due to Eclipse’s lack of support for multiple versions,
we decided to exclude the evolution analysis from our evaluation,
although we consider it one of the strong points of our approach.
Providing the users with several projects in Eclipse representing
different versions of the same system, with no relation among them
(or even worse, with just a versioning repository), would have been
unfair.
Q1 : Does the use of CodeCity increase the correctness of the
solutions to program comprehension tasks, compared to non-visual exploration tools, regardless of the object system’s
size?
Q2 : Does the use of CodeCity reduce the time needed to solve
program comprehension tasks, compared to non-visual exploration tools, regardless of the object system’s size?
3. Take into account the range of experience level of the participants. Experience can greatly influence the outcome of
the experiment.
5. Find a set of relevant tasks. The tasks should be close in
scope and complexity to real tasks performed by practitioners.
Object system(s)
Classes
kLOC
Wettel et al. [20]
Figure 2: Example of disharmony map (a part of JDK 1.5)
5.1
2. Involve participants from industry. Any approach devised
to support practitioners in their work is best evaluated using a
subject sample with a fair share of software practitioners.
4. Provide a tutorial of the experimental tool to the participants. The experimental group would require intensive training to even come close to the skills of the control group
acquired in years of operation. Therefore, a minimal exercising of the experimental group in using the features required
to solve the tasks is paramount.
java.utils
CodeCity also supports software evolution analysis activities [18],
which for reasons we explain later we did not evaluate.
Tool support. Our approach is implemented in CodeCity [17],
a freely available1 standalone tool programmed in Smalltalk and
depicted in Figure 1. Apart from the main visualization panel, there
is an information panel providing contextual data on the selected
city artifact (e.g., name, metric values, etc.). CodeCity supports a
broad range of facilities to interact, such as inspection or querying,
and navigate the visualization. Although not integrated in an IDE,
CodeCity features unidirectional linking between the visualization
and the source code, i.e., the source code of the software element
represented by a city artifact can be accessed in an editor from the
visualization. CodeCity is customizable: the user can compose a new
view by choosing from a set of metrics and artifact representations.
1
1. Choose a fair baseline for comparison. If one wants to
provide evidence for the value of an approach, the question of
what one compares it to has to be answered unambiguously.
NOM: 71
NOA: 81
Experiment
EXPERIMENTAL DESIGN WISH LIST
We conducted a very exhaustive survey of research works dealing
with experimental validation of software engineering, information
visualization, and software visualization approaches. The survey,
which included more than 50 articles and various books, is detailed
in a technical report [20]. From the survey we distilled the following
experimental design wish list, which we kept in mind as we designed
and conducted the experiment:
2.
Software Systems as Cities: A Controlled Experiment
Software visualization is a popular program comprehension technique used in the context of software maintenance, reverse engineering, and software evolution analysis. While there is a broad range
of software visualization approaches, only few have been empirically evaluated. This is detrimental to the acceptance of software
visualization in both the academic and the industrial world.
of a 3D software visualization approach based on a city metaphor
and implemented in a tool called CodeCity. The goal is to provide
experimental evidence of the viability of our approach in the context
of program comprehension by having subjects perform tasks related
to program comprehension. We designed our experiment based on
lessons extracted from the current body of research. We conducted
the experiment in four locations across three countries, involving
41 participants from both academia and industry. The experiment
shows that CodeCity leads to a statistically significant increase in
terms of task correctness and decrease in task completion time. We
detail the experiment we performed, discuss its results and reflect
on the many lessons learned.
PLEIAD @ DCC
University of Chile
rrobbes@dcc.uchile.cl
It has earned a reputation as an effective program comprehension
technique, and is widely used in the context of software maintenance,
reverse engineering and reengineering [2].
The last decade has witnessed a proliferation of software visualization approaches, aimed at supporting a broad range of software
engineering activities. As a consequence, there is a growing need for
objective assessments of visualization approaches to demonstrate
their effectiveness. Unfortunately, only few software visualization
approaches have been empirically validated so far, which is detrimental to the development of the field [2].
One of the reasons behind the shortage of empirical evaluation lies
in the difficulty of performing a controlled experiment for software
visualization. On the one hand, the variety of software visualization
approaches makes it almost impossible to reuse the design of an
experiment from the current body of research. On the other hand, organizing and conducting a proper controlled experiment is, in itself,
a difficult endeavor, which can fail in many ways: data which does
not support a true hypothesis, conclusions which are not statistically
significant due to insufficient data points, etc.
of a software visualization approach based on a city metaphor [16].
The aim is to show that our approach, implemented in a tool called
CodeCity, is at least as effective and efficient as the state of the
practice at supporting reverse engineering and program comprehension activities. We conceived a set of tasks and measured both the
correctness of the task solutions and the task completion time. We
conducted the experiment in four locations across three countries,
with participants from both industry and academia, with a broad
range of experience.
In this paper we make the following two major contributions:
1. we detail the experiment design and its operation, reporting
on a number of lessons learned regarding the many pitfalls
that this type of experiment entails, and
Software visualization, Empirical validation
Permission to make digital or hard copies of all or part of this work for
personal or classroom use is granted without fee provided that copies are
not made or distributed for profit or commercial advantage and that copies
bear this notice and the full citation on the first page. To copy otherwise, to
republish, to post on servers or to redistribute to lists, requires prior specific
permission and/or a fee.
ICSE ’11, May 21–28 2011, Honolulu, Hawaii
Copyright 2011 ACM 978-1-4503-0445-0/11/05 ...$10.00.
ABSTRACT
Romain Robbes
REVEAL @ Faculty of Informatics
University of Lugano
{richard.wettel,michele.lanza}@usi.ch
!
B2.1
2.14
1.88
2.65
2.10
1.69
2.52
3.91
3.14
1 3
1 1 1
1
1
1
1
1
3
2
1
R
R
1
1 1
R
6
1
Antwerp
n
m
R
remote, self-controlled session
5
6
Bern
4
6
During the experimental session(s), the subjects solve the tasks
with the assigned tool on the assigned object system, under the
experimenter’s observation. The numbers in Figure 3 reflect only
the participants whose data points were taken into account during
the analysis, and not those excluded from it, which we discuss later.
6.1
Bologna
Training session
followed by an experimental session with
n subjects with CodeCity (experimental)
m subjects with Eclipse+Excel (control)
Legend:
R
1
Figure 3: The timeline of the experiment
The operation covered the period Nov 2009–Apr 2010 and was
divided into two phases: the pilot and the experiment. Figure 3
shows the timeline of the operation. The operation is composed of a
series of experimental runs; each run consists of a one hour training
session, followed by one or more experimental sessions of up to two
hours. A training session consists of a talk in which the experimenter
presents the approach, concluded with a CodeCity demonstration.
The Pilot Study
We conducted a pilot study with Master students of the University
of Lugano enrolled in a course on Software Design; improving
the questionnaire and solving problems as they emerged required
several iterations. We organized a joint training session, followed
by three experimental sessions, each one week apart. Before the
first session we conducted the experiment with a researcher from
our group, with extensive experience with Eclipse, to make sure the
tasks for the control group were doable in the allotted time. We did
not include any of these data points in the analysis.
6.2
Experimental Runs
At this point, we were confident enough to start our experiment.
We performed the following experimental runs:
Bologna I. 8 professionals with 4–10 years of experience.
Bologna II. 9 professionals with 7–20 years of experience.
Lugano I. 1 researcher/development leader of a small company.
Lugano II & III. 5 practitioners with 10+ years of experience.
Antwerp. 3 Ph.D. and 8 MSc students.
Bern. 2 consultants, 1 professor, 7 Ph.D. and 1 MSc student.
...happily!
ever after?
5,143 citations
5,143 citations
!
S
L
L
u
!
B
T
*
*
h-index 37
Impact
>
Manhattan
!
manhattan.inf.usi.ch
Academics @ Work
conflict
Courtesy of
Adrian Kuhn & Oscar Nierstrasz
(Univ. of Bern, Switzerland)
!
Eclipse
Courtesy of
Claus Lewerentz & Frank Steinbrückner
(TU Cottbus, Germany)
!
Academic Research
Part III
Transcendence
Industrial Reality
review
rearchitect
version
tasks
design
edit
edit
reuse
run
compile
RSS
document
run
Twitter
IRC
debug Skype
diff
Facebook
recommend
refactor
navigate
visualize
blog
compile
e-mail
test reengineer
!
!
!
!
for(int j=m; j>i; j--){
uCJM1= dataUC[j-1];
uCJ= dataUC[j];
if(uCJM1.compare(z)>
{ /* exchange */
tempStr= data[j-1];
/* sort the data */
data[j-1]= data[j];
data[j]= tempStr;
dataUC[j-1]= uCJ;
dataUC[j]= uCJM1;
for(int j=m; j>i; j--){
uCJM1= dataUC[j-1];
uCJ= dataUC[j];
if(uCJM1.compare(z)>
{ /* exchange */
tempStr= data[j-1];
/* sort the data */
data[j-1]= data[j];
data[j]= tempStr;
dataUC[j-1]= uCJ;
dataUC[j]= uCJM1;
}
}
}
}
!
!
!
!
!
(1)
(2)
(3)
(4)
(5)
(6)
!
(7)
(8)
(9)
(10)
(11)
!
Alice wrote:
On Mon 23, Bob wrote:
Dear list,
When starting up ArgoUML on my MacOS X system (Java 2)
it throws a NullPointerException very soon. You'll find the
trace below. I hope someone knows a solution. Thanks a lot!
Exception in thread "main" java.lang.NullPointerException
at
javax.swing.event.SwingSupport.fireChange(SwingChange.java)
at javax.swing.AbstractAction.setEnabled(AbstractAction.java)
[...]
at uci.uml.Main.main(Main.java:148)
(12)
(13)
(14)
(15)
I'm sorry I can't help you Bob but thanks for sharing the stack...
Alice.
-- "Beware of programmers who carry screwdrivers." --L. Brandwein
(16)
(17)
(18)
(19)
(20)
Alice, I believe the flawed Explorer.java class generates Bob's issue:
public void setEnclosingFig(Fig each) {
super.setEnclosingFig(each);
if (each != null || (each.getOwner() instanceof MPackage)) {
m = (MPackage) each.getOwner(); }
(21)
(22)
(23)
(24)
(25)
(26)
(27)
(28)
The problem is in the condition, I attach the diff with this version:
--- src/org/argouml/ui/explorer/Explorer.java
(revision 14338)
+++ src/org/argouml/ui/explorer/Explorer.java (working copy)
@@ -147,1 +147,1 @@
[...]
- if (each != null || (each.getOwner() instanceof MPackage)) {
+ if (each != null && (each.getOwner() instanceof MPackage)) {
(29)
(30)
(31)
(32)
(33)
(34)
Probably ModelTree is also affected, if so, please change it =)
Cheers, Carl.
-- I used to have a sig, but it took up much space so I got rid of it!
--------------------------------------------------------------------
To unsubscribe, e-mail: dev-...@argouml.tigris.org
For additional commands, e-mail: dev-...@argouml.tigris.org
!
!
!
Non-relevant
Natural Language
Stack trace
Source code
Patch
The content of unstructured data
produced during the evolution of a
software system is a valuable information
source to support software understanding
and evolution and complements data
mined from structured sources.
!
Alberto Bacchelli
!
Mining Unstructured Software Data
Alice wrote:
> On Mon 23, Bob wrote:
> > Dear list,
> > When starting up ArgoUML on my MacOS X system (Java 2)
> > it throws a NullPointerException very soon. You'll find the
> > trace below. I hope someone knows a solution. Thanks a lot!
>
>
>
>
>
>
>
>
Exception in thread "main" java.lang.NullPointerException
at
javax.swing.event.SwingSupport.fireChange(SwingChange.java)
at javax.swing.AbstractAction.setEnabled(AbstractAction.java)
[...]
> > at uci.uml.Main.main(Main.java:148)
>
>
>
>
I'm sorry I can't help you Bob but thanks for sharing the stack...
Alice.
-- "Beware of programmers who carry screwdrivers." --L. Brandwein
Alice, I believe the flawed Explorer.java class generates Bob's issue:
public void setEnclosingFig(Fig each) {
if (each != null || (each.getOwner() instanceof MPackage)) {
The problem is in the condition, I attach the diff with this version:
--- src/org/argouml/ui/explorer/Explorer.java
(revision 14338)
+++ src/org/argouml/ui/explorer/Explorer.java (working copy)
@@ -147,1 +147,1 @@
[...]
- if (each != null || (each.getOwner() instanceof MPackage)) {
+ if (each != null && (each.getOwner() instanceof MPackage)) {
Probably ModelTree is also affected, if so, please change it =)
Cheers, Carl.
-- I used to have a sig, but it took up much space so I got rid of it!
--------------------------------------------------------------------
To unsubscribe, e-mail: dev-...@argouml.tigris.org
For additional commands, e-mail: dev-...@argouml.tigris.org
Recommender Systems for
Software Engineering
“RSSEs are software applications that
provide information estimated to be
valuable for a software engineering task
in a given context”
!
PhD Thesis, University of Lugano, 2013
M. P. Robillard, R. J. Walker, and T. Zimmermann
Recommender systems for software engineering
IEEE Software, 2010
!
Next-gen Development Environments + Recommender Systems
!
No Spontaneous
Recommendation
!
!
!
!
!
!
No
Self-Confidence
!
!
!
Intelligent/Immersive
Development Environments
>
Prompter
!
prompter.inf.usi.ch
Make Weapons out of Imperfections
The Librarian daemon looks like a
pleasant, fiftyish, silver-haired, bearded
man.
Even though he's just a piece of
software, the librarian has reason to be
cheerful; he can move through the
nearly infinite stacks of information in the
Library with the agility of a spider dancing across a
vast web of cross-references. The only thing he can't do
is think.
“Yes, sir,” the Librarian says. He is eager without
being obnoxious, he clasps his hands behind his back,
rocks forward slightly on the balls of his feet, raises
his eyebrows expectantly over his half-glasses.
80
60
Completeness
●
40
is effective in
development tasks
20
Prompter
100
Development Task
NP
P
Treatment
Librarians are there to:
help; aid; assist; teach; collate; enthuse; catalogue;
index; arrange; organize; find; discover; promote;
display; interest; intrigue; amuse; amaze; help
children, adults, old people, the underprivileged, the
rich, the poor, those with voices and those without;
protect resources, archive them and save them for the
future; provide differing viewpoints; engender thought,
conversation, fun; provide the best answers possible
and match the answer to the enquirer; provide just
enough information without overwhelming the user.
!
NP = Without Prompter
P = With Prompter
Google is there to:
make money.
The End
!
…of the
Beginning
old habits
die hard
Denn wir sind wie Baumstämme im
Schnee. Scheinbar liegen sie glatt auf,
und mit kleinem Anstoss sollte man sie
wegschieben können. Nein, das kann
man nicht, denn sie sind fest mit dem
Boden verbunden. Aber sieh, sogar
das ist nur scheinbar.
deicIDE
!
The Rise and Fall of
Integrated Development Environments
!
Michele Lanza
!
!
Franz Kafka, “Die Bäume”, 1913
REVEAL | Faculty of Informatics
University of Lugano, Switzerland
!

deicide Absurdistan Bankistan

Transcription

Similar documents

Mary Ellen Smith Glasgow, PhD, RN, ACNS-BC

free-body diagrams - Universidad Interamericana de Puerto Rico

Benefit Highlights - Inter Valley Health Plan

Canada Games

Motaung was ihot three tunes by the security police. He was only

PDF

Spring/Summer 2013 - Arkansas Northeastern College

12752_Collins ks2 RG English_p12

Here`s - American Studies Department at The University of Bucharest

Fall/Winter 2013 - Arkansas Northeastern College