** Citation Count Analysis Script **
** Author: Brian Nosek **
** Creation Date: 14 August 2009 **
** Last revision: 26 March 2010 **;
* Library reference pointing at the folder that holds the citation data files;
libname web 'C:\primary\Data\Citations\';

* Read one tab-delimited record per faculty member and derive the individual-level analysis variables;
data citations;
    * Character widths are declared before INPUT so that each field gets the width given here;
    informat lastname $25.;
    informat firstname middlen whosearched $15.;
    informat gender ethnicity $1.;
    informat univ $40.;
    informat searchdate $11.;

    * NOTE(review): neither DSD nor TRUNCOVER is set, so consecutive tabs are read as one delimiter - confirm the file never has empty fields;
    infile "C:\primary\Data\Citations\individuals.txt" delimiter='09'x firstobs=2;
    input lastname $ firstname $ middlen $ gender $ ethnicity $ PhDyear Univ $ SearchDate $
          WhoSearched $ CleanComplexity TotalCites PubYears hindex eindex hmindex WOStotal updated;

    * Years since PhD are counted from 2010, so 2009 PhDs are marked as 1 year;
    YsincePhD = 2010 - PhDyear;
    * New and updated searches for the PSPB revision (updated = 1) get an extra half year;
    if updated = 1 then YsincePhD = YsincePhD + 0.5;

    * Correcting a typo in the university name;
    if univ = "Brown" then univ = "Brown University";

    * Display names used in the ranking listings;
    informat wholename $40.;
    wholename = trim(lastname)||", "||trim(firstname);
    informat pyear $4.;
    * Explicit PUT avoids the implicit numeric-to-character conversion note;
    pyear = put(PhDyear, 4.);
    informat nameinst $80.;
    nameinst = trim(firstname)||" "||trim(lastname)||", "||trim(univ)||" ("||trim(pyear)||" PhD)";

    * Log transforms. Each log is taken only for positive input so zero counts do not raise invalid-argument notes;
    if TotalCites > 0 then logcite = log(TotalCites);
    if hindex > 0 then logh = log(hindex);

    * An e of 0 is kept as a log of 0 (one outlier). A missing e now stays missing instead of becoming 0;
    if eindex > 0 then loge = log(eindex);
    else if eindex = 0 then loge = 0;

    * An hm below 1 would give a negative log and is floored at 0 to retain absolute zero (one outlier);
    * Missing sorts below every number in SAS, so the range test is written to leave a missing hm as missing;
    if hmindex >= 1 then loghm = log(hmindex);
    else if 0 <= hmindex < 1 then loghm = 0;

    /*
       CORE INDIVIDUAL INDICES
         CUMULATIVE IMPACT INDICES
           TotalCites   = total citation count from Publish or Perish analysis
           hindex       = h calculation based on Publish or Perish analysis, Hirsch 2005
           hmindex      = fractionalized h correcting for number of co-authors, correlated >.95 with h
           eindex       = very similar to and correlates extremely highly with total citation count
           Icumulative  = average of above four after standardizing with mean of 0 and std of 1
         CAREER-STAGE IMPACT INDICES
           byYcite      = TotalCites score based on Years since PhD: log(total citations) corrects for
                          heteroscedasticity, then subtract that value from expected log citation value
                          for years since PhD based on regression
           byYh         = hindex score based on Years since PhD: log(h-index) corrects for
                          heteroscedasticity to increase comparability across years, then subtract that
                          value from expected log citation value for years since PhD based on regression
           byYe         - same
           byYhm        - same
           Icareerstage = average of above four after standardizing with mean of 0 and std of 1
       CORE DEPARTMENT INDICES
         CUMULATIVE IMPACT INDICES
           DCites       = sum of total citation count among core faculty
           Dh           = sum of h-indices among core faculty
           De           = sum of e-indices
           Dhm          = sum of hm-indices
           Dcumulative  = average of above four after standardizing with mean of 0 and std of 1
         CAREER STAGE ADJUSTED IMPACT INDICES
           DbyYcite     = average of byYcite among core faculty
           DbyYh        = average of byYh among core faculty
           DbyYe        - same
           DbyYhm       - same
           Dcareerstage = average of above four after standardizing with mean of 0 and std of 1
         AGGREGATE INDICATOR
           Daggregate   = average of the cumulative and career-stage indices after standardizing
                          each with mean of 0 and std of 1
       ALTERNATIVE INDICES available that were not used in analysis
           mindex       = h divided by years since PhD, Hirsch 2005
           WOStotal     = Web of Science total citation count, just used as reliability check for
                          the Publish or Perish count
    */
    * The m-index is skipped when years since PhD is 0 so the division cannot fail;
    if YsincePhD ne 0 then mindex = hindex / YsincePhD;

    * Constant used as the analysis variable when counting records per class level;
    i = 1;

    * Normalise gender codes to lower case;
    if gender = "F" then gender = "f";
    else if gender = "M" then gender = "m";

    * Collapse ethnicity into w versus o. Codes outside the two lists leave ethnic blank;
    if upcase(ethnicity) = "W" then ethnic = "w";
    else if upcase(ethnicity) in ('"', 'A', 'B', 'H', 'O') then ethnic = "o";

    * Breakdown by graduation decade. Years of 1949 or earlier (and missing years) leave PhDdec missing;
    if PhDyear > 1999 then PhDdec = 2000;
    else if PhDyear > 1989 then PhDdec = 1990;
    else if PhDyear > 1979 then PhDdec = 1980;
    else if PhDyear > 1969 then PhDdec = 1970;
    else if PhDyear > 1959 then PhDdec = 1960;
    else if PhDyear > 1949 then PhDdec = 1950;
run;
* Sample description - dataset contents, overall means and record counts per group;
proc contents data=citations;
run;

* Overall means, including mean years since PhD;
proc means data=citations;
run;

%macro count_by_level(classvar);
    /* N of the constant i within each level of the requested class variable */
    proc means data=citations n;
        class &classvar;
        var i;
    run;
%mend count_by_level;

* Demographic breakdowns;
%count_by_level(PhDdec)
%count_by_level(gender)
%count_by_level(ethnicity)
%count_by_level(ethnic)
%count_by_level(updated)

* Representation of universities and how many searches were completed by each researcher;
%count_by_level(univ)
%count_by_level(whosearched)

* Data for the J. Schwartz illustration in the intro;
proc univariate data=citations;
    where 1999 < PhDyear and PhDyear < 2006;
    var totalcites;
run;

* Overall correlations among the basic indices, raw and then partialling out years since PhD;
proc corr data=citations;
    var YsincePhD totalcites hindex eindex hmindex wostotal;
run;

proc corr data=citations;
    var totalcites hindex eindex hmindex wostotal;
    partial YsincePhD;
run;

* Publish or Perish versus Web of Science totals, restricted to records with wostotal above -1;
proc means data=citations;
    where wostotal > -1;
    var totalcites wostotal;
run;

* Coefficient alpha across the four cumulative indices;
proc corr data=citations alpha;
    var totalcites hindex eindex hmindex;
run;
%macro reg_on_years(response, modelopt);
    /* Simple regression of one index on years since PhD with the requested MODEL option */
    proc reg data=citations;
        model &response = YsincePhD / &modelopt;
    run;
%mend reg_on_years;

* Raw citation and h data by years since PhD illustrate heteroscedasticity;
%reg_on_years(totalcites, stb)
%reg_on_years(hindex, stb)
%reg_on_years(eindex, stb)
%reg_on_years(hmindex, stb)

* Heteroscedasticity threatens interpretation of regression estimates and comparability of deviation scores across the career span;
* The log of citations and h was used and these non-linear adjustments eliminated these correlations;
* That suggests that the meaning of citeCS and hCS are comparable across the career span;

* Regressions to generate citation and h intercepts and slopes across the whole sample;
%reg_on_years(logcite, r)
%reg_on_years(logh, r)
%reg_on_years(loge, r)
%reg_on_years(loghm, r)

* Means of the four log-transformed indices;
proc means data=citations;
    var logcite logh loge loghm;
run;
* Distribution analysis example for logcite, written as HTML with ODS graphics;
filename grafout 'C:\primary\Data\Citations\';

ods html path=grafout file="citationresiduals.html";
ods graphics on;

* Diagnostic panels are unpacked so each plot is produced separately;
proc reg data=citations plots(unpackpanels);
    model logcite = YsincePhD;
run;
quit;

ods graphics off;
ods html close;
%macro mean_by_year(index);
    /* Descriptive statistics of one raw index within each value of years since PhD */
    proc means data=citations;
        class YsincePhD;
        var &index;
    run;
%mend mean_by_year;

%macro early_career_fit(response);
    /* Regression of one log index on years since PhD, restricted to YsincePhD below 11 */
    proc reg data=citations;
        where YsincePhD < 11;
        model &response = YsincePhD;
    run;
%mend early_career_fit;

* Averages for each year since PhD as a comparison to the regression estimates;
%mean_by_year(totalcites)
%mean_by_year(eindex)
%mean_by_year(hindex)
%mean_by_year(hmindex)

* Refit regressions just for early career folks - fewer than 11 years since PhD;
%early_career_fit(logcite)
%early_career_fit(loge)
%early_career_fit(logh)
%early_career_fit(loghm)
* Calculating deviations from the regression line for the career stage indicators;
* NOTE(review): the intercepts and slopes below are hard-coded, presumably copied from the whole-sample log regressions - re-check them whenever the data change;
* Each xEXP flag is -1 below the expected value and 1 otherwise, used to test whether early-career researchers fall systematically under the line;
* Missing sorts below every number in SAS, so a missing deviation is handled first and keeps its flag missing instead of being counted as below expected;
data citations;
    set citations;

    * Total citations - deviation from expected citation value;
    citeINT = 5.461;
    citeSLOPE = .0803;
    citeCS = logcite - (citeINT + YsincePhD*citeSLOPE);
    if missing(citeCS) then citeEXP = .;
    else if citeCS < 0 then citeEXP = -1;
    else citeEXP = 1;

    * h index - deviation from expected h value;
    hINT = 1.929;
    hSLOPE = .0413;
    hCS = logh - (hINT + YsincePhD*hSLOPE);
    if missing(hCS) then hEXP = .;
    else if hCS < 0 then hEXP = -1;
    else hEXP = 1;

    * e index - deviation from expected e value;
    eINT = 2.505;
    eSLOPE = .0383;
    eCS = loge - (eINT + YsincePhD*eSLOPE);
    if missing(eCS) then eEXP = .;
    else if eCS < 0 then eEXP = -1;
    else eEXP = 1;

    * hm index - deviation from expected hm value;
    hmINT = 1.220;
    hmSLOPE = .0485;
    hmCS = loghm - (hmINT + YsincePhD*hmSLOPE);
    if missing(hmCS) then hmEXP = .;
    else if hmCS < 0 then hmEXP = -1;
    else hmEXP = 1;
run;

* Simple correlations among key variables;
proc corr data=citations;
    var YsincePhD totalcites hindex citeCS hCS;
run;

proc corr data=citations;
    var YsincePhD eindex hmindex eCS hmCS;
run;

proc corr data=citations;
    var YsincePhD totalcites hindex citeCS hCS;
    with eindex hmindex eCS hmCS;
run;
* Keep copies of the raw citation, h, e and hm values before they are standardized in place;
data citations;
    set citations;
    RAWcites = totalcites;
    RAWh     = hindex;
    RAWe     = eindex;
    RAWhm    = hmindex;
run;

* Standardize the four cumulative indices and the four career-stage deviations to mean 0 and std 1;
proc standard data=citations mean=0 std=1 out=citations;
    var totalcites hindex eindex hmindex citecs hcs ecs hmcs;
run;

* Aggregate scores are the mean of whichever standardized components are non-missing;
* An Iaggregate blend of both indicators (mean of Icareerstage and Icumulative) was not used in the article and is not computed;
data citations;
    set citations;
    Icareerstage = mean(of citecs hcs ecs hmcs);
    Icumulative  = mean(of totalcites hindex eindex hmindex);
run;

* Means and correlations of all key variables;
proc means data=citations;
run;

proc corr data=citations;
    var Icumulative Icareerstage;
run;

proc corr data=citations;
    var Icumulative Icareerstage;
    with totalcites hindex eindex hmindex citecs hcs ecs hmcs;
run;

* Ranking of cumulative scores, listed first with raw and then with standardized components;
proc sort data=citations;
    by Icumulative;
run;

proc print data=citations;
    var nameinst Icumulative RAWcites RAWe RAWh RAWhm;
run;

proc print data=citations;
    var nameinst Icumulative totalcites eindex hindex hmindex;
run;
/*
proc print data=citations; var wholename Icumulative Icareerstage;run;
*/
%macro expectation_split(flag, deviation, window);
    /* Deviation statistics within each level of the above/below-expected flag,
       limited to the records that satisfy the WHERE condition passed in window */
    proc means data=citations;
        class &flag;
        var &deviation;
        where &window;
    run;
%mend expectation_split;

* Ranking of career-stage scores;
proc sort data=citations;
    by Icareerstage;
run;

proc print data=citations;
    var nameinst Icareerstage citeCS eCS hCS hmCS;
run;

* Early career analysis - shows that early career estimates are off;
* First window is fewer than 6 years since PhD;
%expectation_split(citeEXP, citeCS, ysincephd < 6)
%expectation_split(eEXP,    eCS,    ysincephd < 6)
%expectation_split(hEXP,    hCS,    ysincephd < 6)
%expectation_split(hmEXP,   hmCS,   ysincephd < 6)

* Second window is more than 5 and fewer than 11 years since PhD;
%expectation_split(citeEXP, citeCS, 5 < ysincephd < 11)
%expectation_split(eEXP,    eCS,    5 < ysincephd < 11)
%expectation_split(hEXP,    hCS,    5 < ysincephd < 11)
%expectation_split(hmEXP,   hmCS,   5 < ysincephd < 11)
* Export of the individual-level results to the permanent library, kept disabled;
* The block comment is now closed explicitly instead of relying on the end of a later inline comment;
/*
proc contents;run;
data web.forlisa; set citations;
keep nameinst YsincePhD wholename univ RAWcites RAWh RAWe RAWhm Icumulative Icareerstage citeCS eCS hCS hmCS;
run;
*/
** END OF BY INDIVIDUAL ANALYSES;

* Trying a multilevel model;
* PROC MIXED accepts exactly one response variable per MODEL statement, so the intercept-only model is fitted once per outcome;
%macro univ_intercept_model(outcome);
    /* Intercept-only model with a random intercept for each university.
       Add corrb to the MODEL options to see correlations between fixed effects.
       cl requests confidence intervals.
       Open question from the original script: should type=un be in this model */
    proc mixed data=citations noclprint noinfo ord;
        class univ;
        model &outcome = / cl;
        random intercept / subject=univ cl;
    run;
%mend univ_intercept_model;

%univ_intercept_model(citeEXP)
%univ_intercept_model(eEXP)
%univ_intercept_model(hEXP)
%univ_intercept_model(hmEXP)
** BY DEPARTMENT ANALYSES;
* Creating by department variables;

%macro dept_stats(var, out, stats=mean);
    /* Per-university summary of one variable. The OUTPUT statement names no statistics,
       so the output dataset holds one row per default statistic, identified by _STAT_ */
    proc means &stats data=citations;
        class univ;
        var &var;
        output out=&out;
    run;
%mend dept_stats;

%macro dept_mean_rows(in, out);
    /* Keep only the MEAN rows of a dept_stats dataset and drop the bookkeeping columns */
    data &out;
        set &in;
        if _STAT_ = "MEAN";
        drop _type_ _freq_ _stat_;
    run;
%mend dept_mean_rows;

%dept_stats(citeCS,    D1, stats=n mean)
%dept_stats(hCS,       D2)
%dept_stats(RAWcites,  D3)
%dept_stats(RAWh,      D4)
%dept_stats(YsincePhD, D5)
%dept_stats(eCS,       D6)
%dept_stats(hmCS,      D7)
%dept_stats(RAWe,      D8)
%dept_stats(RAWhm,     D9)

* Department means, one dataset per variable;
%dept_mean_rows(D1, D1a)
%dept_mean_rows(D2, D2)
%dept_mean_rows(D3, D3)
%dept_mean_rows(D4, D4)
%dept_mean_rows(D5, D5)
%dept_mean_rows(D6, D6)
%dept_mean_rows(D7, D7)
%dept_mean_rows(D8, D8)
%dept_mean_rows(D9, D9)

* Department size, taken as the number of non-missing citeCS values in each university;
data D1b;
    set D1;
    if _STAT_ = "N";
    deptN = citeCS;
    drop _type_ _freq_ _stat_ citeCS;
run;

* Department totals are summed directly rather than rebuilt as deptN times the mean;
* The product is only correct when every variable has exactly deptN non-missing values;
proc means data=citations noprint nway;
    class univ;
    var RAWcites RAWh RAWe RAWhm;
    output out=Dsums(drop=_type_ _freq_) sum=Drawcites Drawh Drawe Drawhm;
run;

* One row per university. The overall row written by PROC MEANS has a blank univ and is removed;
data Dcitations;
    merge D1a D1b D2 D3 D4 D5 D6 D7 D8 D9 Dsums;
    by univ;
    if univ = "" then delete;
    Dyears = YsincePhD;
    drop ysincePhD;
    * Copies of the raw sums that are standardized later, leaving the Draw versions untouched;
    Dcites = Drawcites;
    Dh     = Drawh;
    De     = Drawe;
    Dhm    = Drawhm;
run;
* Standardize the department sums and the department mean deviations to mean 0 and std 1;
proc standard data=Dcitations mean=0 std=1 out=Dcitations;
    var Dcites Dh De Dhm citeCS hCS eCS hmCS;
run;

* Creating aggregated variables;
* The standardized deviations are copied to D-prefixed names and the originals dropped before the composites are averaged;
data Dcitations;
    set Dcitations;
    DciteCS = citeCS;
    DhCS    = hCS;
    DeCS    = eCS;
    DhmCS   = hmCS;
    drop citeCS hCS eCS hmCS;
    Dcumulative  = mean(of Dcites Dh De Dhm);
    Dcareerstage = mean(of DciteCS DhCS DeCS DhmCS);
    Daggregate   = mean(of Dcumulative Dcareerstage);
run;
* Basic means and correlations of key department variables;
proc means data=Dcitations;
run;

proc corr data=Dcitations;
    var deptN Dyears Dcumulative Drawcites Drawh Drawe Drawhm;
run;

proc corr data=Dcitations;
    var deptN Dyears Dcareerstage DciteCS DhCS DeCS DhmCS;
run;

proc corr data=Dcitations;
    var deptN Dyears Dcumulative Drawcites Drawh Drawe Drawhm;
    with deptN Dyears Dcareerstage DciteCS DhCS DeCS DhmCS;
run;

* Cumulative rankings, listed first with standardized and then with raw sums;
proc sort data=Dcitations;
    by Dcumulative;
run;

proc print data=Dcitations;
    var univ deptN Dyears Dcumulative Dcites De Dh Dhm;
run;

proc print data=Dcitations;
    var univ deptN Dyears Dcumulative Drawcites Drawe Drawh Drawhm;
run;

* How much do size and seniority of the department account for Dcumulative ranks;
proc reg data=Dcitations;
    model Dcumulative = deptN Dyears;
run;
/*
*what does it look like with average cites and h rather than cumulative (sum)?;
proc standard mean=0 std=1 out=Dcitations; var rawcites rawh rawe rawhm;run;
data Dcitations; set Dcitations;
Davgcum = mean(rawcites, rawh, rawe, rawhm);
proc corr; var deptN Dyears Davgcum rawcites rawh rawe rawhm;run;
proc sort; by Davgcum;
proc print; var univ deptN Dyears Davgcum rawcites rawe rawh rawhm; run;
*end of "average" footnote analysis;
*/
* Career stage rankings;
proc sort data=Dcitations;
    by Dcareerstage;
run;

proc print data=Dcitations;
    var univ deptN Dyears Dcareerstage DciteCS DeCS DhCS DhmCS;
run;

* How much do size and seniority of the department account for Dcareerstage ranks;
proc reg data=Dcitations;
    model Dcareerstage = deptN Dyears;
run;

* Aggregate of cumulative and career stage, and the resulting rankings;
proc sort data=Dcitations;
    by Daggregate;
run;

proc print data=Dcitations;
    var univ deptN Dyears Daggregate Dcumulative Dcareerstage;
run;

* Mizzou and Rochester focus - for the general discussion;
proc print data=citations;
    where univ = "University of Missouri";
    var wholename Icumulative Icareerstage;
run;

proc print data=citations;
    where univ = "University of Rochester";
    var wholename Icumulative Icareerstage;
run;
** END OF BY DEPARTMENT ANALYSES;
** GENDER AND RACE/ETHNICITY ANALYSES;

* Bring the individual dataset back into focus by re-creating it, which makes it the most recently created dataset again;
data citations;
    set citations;
run;

* Gender and ethnicity mean comparisons;
proc means data=citations;
    class gender;
    var ysincePhD Icumulative Icareerstage;
run;

proc means data=citations;
    class ethnic;
    var ysincePhD Icumulative Icareerstage;
run;

* Career-stage score by PhD decade, split by gender and then by ethnicity;
proc means data=citations;
    class PhDdec gender;
    var Icareerstage;
run;

proc means data=citations;
    class PhDdec ethnic;
    var Icareerstage;
run;

* Two regression strategies for testing gender and race/ethnicity differences - and the contribution of years since PhD;
proc glm data=citations;
    class gender ethnic;
    model Icumulative = gender ethnic;
run;

proc glm data=citations;
    class gender ethnic;
    model Icumulative = gender ethnic ysincephd;
run;

proc glm data=citations;
    class gender ethnic;
    model Icareerstage = gender ethnic;
run;