readstata13/0000755000176200001440000000000013103134150012355 5ustar liggesusersreadstata13/inst/0000755000176200001440000000000013077075424013354 5ustar liggesusersreadstata13/inst/extdata/0000755000176200001440000000000013077075572015012 5ustar liggesusersreadstata13/inst/extdata/missings.dta0000644000176200001440000000242713077075424017341 0ustar liggesusers
118LSF11 Jul 2016 23:28
<7Cf missings%9.0g=  K K =?d=  =KB=KB L L?d    KA==K (08@HPX`hpx
readstata13/inst/extdata/underscore.do0000644000176200001440000000020213077075424017475 0ustar liggesusersclear all set obs 2 gen v_1 = _n gen v_2 = _n gen long_name_multiple_underscores = _n compress save "underscore.dta", replace readstata13/inst/extdata/statacar.dta0000644000176200001440000002751713077075424017316 0ustar liggesusers
118LSF  6 Sep 2016 14:04
P UC),*-C/O/ide20brandsmodeltypehpdroommaxknmileageecarhldateg_circleldatecalmodelStrL%8.0g%8.0gc%8.0g%8.0g%20sc%8.0g%8.0gc%8.0g%%20s%8.0g%6.2f%8.0g%13.0g%8.0g%10.0g%9.0g%8.0g%tbsp500%tdg%9sgakenakeodeltype_enurationivisionivisionivisionriginriginNumeric IDXKXnKXHLKXXHLՐKpdHL KpdqL XL  XX ՐK7HLHLKBrand of carXKXnKXHLKXXHLՐKpdHL KpdqL XL  XX ՐK7HLHLKCar modelarXKXnKXHLKXXHLՐKpdHL KpdqL XL  XX ՐK7HLHLKCar classificationKXnKXHLKXXHLՐKpdHL KpdqL XL  XX ՐK7HLHLKHorse PowercationKXnKXHLKXXHLՐKpdHL KpdqL XL  XX ՐK7HLHLKMaximum speedtionKXnKXHLKXXHLՐKpdHL KpdqL XL  XX ՐK7HLHLKaximum speedtionKXnKXHLKXXHLՐKpdHL KpdqL XL  XX ՐK7HLHLKaximum speedtionKXnKXHLKXXHLՐKpdHL KpdqL XL  XX ՐK7HLHLKLaunch datedtionKXnKXHLKXXHLՐKpdHL KpdqL XL  XX ՐK7HLHLKLaunch date (calendar)KXnKXHLKXXHLՐKpdHL KpdqL XL  XX ՐK7HLHLKaunch date (calendar)KXnKXHLKXXHLՐKpdHL KpdqL XL  XX ՐK7HLHLK_dta⥂__YZ@@PE`_E` _غZZ@|[_lang_cen_lang_v_en ^ZX ^Zenldatecal__YZ@@PE`_E` _غZZ@|[_lang_v_de!"#$%&'()*+,-./:;<=>?@[\]^Einführungsdatum (Kalender)ldateal__YZ@@PE`_E` _غZZ@|[_lang_v_de!"#$%&'()*+,-./:;<=>?@[\]^Einführungsdatummaxeal__YZ@@PE`_E` _غZZ@|[_lang_v_de!"#$%&'()*+,-./:;<=>?@[\]^Höchstgeschwindigkeithpeal__YZ@@PE`_E` _غZZ@|[_lang_v_de!"#$%&'()*+,-./:;<=>?@[\]^Pferdestärken typeal__YZ@@PE`_E` _غZZ@|[_lang_l_de!"#$%&'()*+,-./:;<=>?@[\]^type_detypeal__YZ@@PE`_E` _غZZ@|[_lang_v_de!"#$%&'()*+,-./:;<=>?@[\]^Klassifikation modelal__YZ@@PE`_E` _غZZ@|[_lang_v_de!"#$%&'()*+,-./:;<=>?@[\]^Automodellbrandal__YZ@@PE`_E` _غZZ@|[_lang_v_de!"#$%&'()*+,-./:;<=>?@[\]^Herstellermarkeidndal__YZ@@PE`_E` _غZZ@|[_lang_v_de!"#$%&'()*+,-./:;<=>?@[\]^Numerische ID_dtaal__YZ@@PE`_E` _غZZ@|[_lang_listRpj0Rpj2en deMeyerordSpeed Start 2000e@f@33#AjF MeyerrdHappy Family!@b b@33@oF AkikoitSusumu 1 #yE@-̬]@XjF AkikonturySusumu 3 (;@P33333_@@mF HutchectraLumberjack 3000q=@fffffc@33cAnlF EriksonbreE-Car 2000+R.@d@lF EriksonreeMaxinator"07@Y#~dtjF EriksonleEMimizer *;@#jF GSO Speed Start 2000GSO  Happy FamilyGSO  Susumu 1GSO  Susumu 3GSO Lumberjack 3000GSO  E-Car 2000GSO  MaxinatorGSO Mimizerptype_deXHLX'' 84#0GeländewagenSportwagenStadtautoFamilienautomaxminftype_enXHLX'' .* &Off-RoadRoadsterCity carFamily carmaxmin
readstata13/inst/extdata/sp500.stbcal0000644000176200001440000000073613077075424017055 0ustar liggesusers* Business calendar "sp500" created by -bcal create- * Created/replaced on 18 Nov 2014 version 12.1 purpose "S&P 500 for 2001" dateformat ymd range 2001jan02 2001dec31 centerdate 2001jan02 omit dayofweek (Sa Su) omit date 2001jan15 omit date 2001feb19 omit date 2001apr13 omit date 2001may28 omit date 2001jul04 omit date 2001sep03 omit date 2001sep11 omit date 2001sep12 omit date 2001sep13 omit date 2001sep14 omit date 2001nov22 omit date 2001dec25 readstata13/inst/extdata/missings_msf.dta0000644000176200001440000000256613077075424020212 0ustar liggesusers
117MSF05 Sep 2016 20:29
 I >Mjvbins%9.0g%9.0g%9.0g%2s?1
readstata13/inst/extdata/statacar.do0000644000176200001440000000363713077075424017145 0ustar liggesusers clear all input int(id) str20 brand str20 model long(type) int(hp) double(max) float(mileage) byte(ecar) long(ldate) str20(ldatecal) 1 "Meyer" "Speed Start 2000" 2 150 176.5 10.2 0 1 2001-01-03 2 "Meyer" "Happy Family" 4 98 145 5.6 0 247 2001-12-31 3 "Akiko" "Susumu 1" 3 45 118.7 -1 0 14 2001-01-23 4 "Akiko" "Susumu 3" 4 80 127.3 6.8 0 134 2001-07-16 5 "Hutch" "Lumberjack 3000" 1 180 156.2 14.2 0 110 2001-06-11 6 "Erikson" "E-Car 2000" 3 . . -2 1 100 2001-05-25 7 "Erikson" "Maxinator" 2147483620 32740 8.988e+307 1.701e+38 100 19 2001-01-30 7 "Erikson" "Mimizer" -2147483647 -32767 -1.798e+308 -1.701e+38 -127 1 2001-01-03 end gen ldatecal2 = date(ldatecal, "YMD") generate strL modelStrL = model drop ldatecal rename ldatecal2 ldatecal // bcal uses a special format. // %tb for business calendar and following the calendar name format ldatecal %td format ldate %tbsp500 // missings replace mileage = .a if mileage ==-1 // no info replace mileage = .b if mileage ==-2 // not applicable // Label en label language en, rename label var id "Numeric ID" label var brand "Brand of car" label var type "Car classification" label var model "Car model" label var hp "Horse Power" label var max "Maximum speed" label var ldate "Launch date" label var ldatecal "Launch date (calendar)" label define type_en 1 "Off-Road" 2 "Roadster" 3 "City car" 4 "Family car" 2147483620 "max" -2147483647 "min", modify label value type type_en // Label de label language de, new label var id "Numerische ID" label var brand "Herstellermarke" label var type "Klassifikation" label var model "Automodell" label var hp "Pferdestrken" label var max "Hchstgeschwindigkeit" label var ldate "Einfhrungsdatum" label var ldatecal "Einfhrungsdatum (Kalender)" label define type_de 1 "Gelndewagen" 2 "Sportwagen" 3 "Stadtauto" 4 "Familienauto" 2147483620 "max" -2147483647 "min", modify label value type type_de label language en save "statacar.dta", replace readstata13/inst/extdata/encodecp.dta0000644000176200001440000000121713077075424017261 0ustar liggesusersst9;9t9ߖJ 1 Sep 2016 17:16numchr%8.0g%9snumlabeltmp/sd04321.000000"cp.dta"acter vtmp/sd04321.000000"cp.dta"acter vEUROEGnumlabel;9t9 EUROEreadstata13/inst/extdata/encode.do0000644000176200001440000000037113077075424016570 0ustar liggesusersclear all set obs 6 gen int num = _n label variable num äöüß label define numlabel 1 "ä" 2 "ö" 3 "ü" 4 "ß" 5 "€" 6 "Œ" label values num numlabel // create character variable from labels decode num, gen(chr) save "encode.dta", replace readstata13/inst/extdata/missings.do0000644000176200001440000000057713077075424017177 0ustar liggesusersclear all set obs 27 gen missing = _n mvdecode missing, mv( 1 = . \ 2 = .a \ 3 = .b \ 4 = .c \ 5 = .d \ 6 = .e \ 7 = .f \ /// 8 = .g \ 9 = .h \ 10 = .i \ 11 = .j \ 12 = .k \ 13 = .l \ 14 = .m \ /// 15 = .n \ 16 = .o \ 17 = .p \ 18 = .q \ 19 = .r \ 20 = .s \ 21 = .t \ /// 22 = .u \ 23 = .v \ 24 = .w \ 25 = .x \ 26 = .y \ 27 = .z ) save "missings.dta", replace readstata13/inst/extdata/gen_fac.do0000644000176200001440000000016613077075424016717 0ustar liggesusersclear all set obs 2 gen v1 = _n label define v1 1 "one" label values v1 v1 compress save "gen_fac.dta", replace readstata13/inst/extdata/underscore.dta0000644000176200001440000000466113077075424017660 0ustar liggesusers
118LSF 1 Sep 2016 15:07
@]C f y v_1_name_multiple_underscoresv_2_name_multiple_underscoreslong_name_multiple_underscores%9.0g%9.0g%9.0go==K=nK=oPo==oՐKp?oKp?qLXL  ==ՐK?ooKo==K=nK=oPo==oՐKp?oKp?qLXL  ==ՐK?ooKo==K=nK=oPo==oՐKp?oKp?qLXL  ==ՐK?ooK
readstata13/inst/extdata/encode.dta0000644000176200001440000000404713077075424016742 0ustar liggesusers
118LSF 1 Sep 2016 17:13
>Up 'numchr%8.0g%9snumlabeläöüßcVpcVKpcVnKpcV::pcVpcV:ՐK:KqLXL  pcVpcVՐK::KäöüßcVpcVKpcVnKpcV::pcVpcV:ՐK:KqLXL  pcVpcVՐK::Käöü߀ŒKnumlabelpcVpcVՐK:pcV äöü߀Œ
readstata13/inst/extdata/nonint.dta0000644000176200001440000000060713077075424017010 0ustar liggesuserssc`-g12 Jul 2016 00:54v1%10.0gv1c&D0cpD[\]^?333333?v1pJ/ponereadstata13/inst/extdata/test.zip0000644000176200001440000000370513077075572016522 0ustar liggesusersPKx_JݳO/ statacar.dtaUT {X{Xux dZ[hU4^4*1f2Nǭ5m$izvtgvgfFxyPbAPj"*ZDTDlPAQ,'mEk6f??B"bdcb[g1R! 02ˬ1l.C6mf `LE)욌_Fx6m6 ZoַjC/K ]⡂)UaIo :*fAo^(W unYwVy+{\tlg{qPpl:LlߥJg& &KG c8DRm3mF=(ll^dxh?#a-` F?6 1dC,7 %~`,$#Q{i+OB B‘ix( uof~OxOGu™5ˏ8 軄}9x\sa*(0 Е)[I>J /v40" W"#C#u"58'FlnQ=׎o`n+z8v}SїUͫ04r;a0]Bu:IU:? %URQ=MbΦ;&ye+{]֞SkQTs^fug&D48g-v,1 C> 6h=-(`lCYuF *歬w^r 8o#,pO. +B B?&j;U_ޚ:BJr(4!Ptt1uukO DIC@-gm*、m/?t4#Ch0D*"Edo0q/nۗ-_r=zYpzcyO ᡇy :/ :.V8U XU`U`gcX sa',G4&6co\e(nD:^4W,:Ͱɉit{-(edc|Ff `!ݔi86sxA0; E͖]^vp -WYoEPoX3 d`+|bdT|z=m,D$8t3eYtb`k0eҔR6B¸y;NĞ#%i)͘}LsN:%?\K-+E +k|sI:kY2_(ˇE2̲o;8,ܲQ`dW~JsU& lqe;d7rlφ f9(.Yǔ5KR@*m7ͣ'. XkJN4>ur]KOKaF B`Glu'#&2u }UР"k!(aWm8*c*E*1(-Pܔk< &ҧ6'fVZ7J*/:Bjɳt}&cṇ`؛texP~t^ws<͆X9i %Y)y>7`{:ݳG6 lrq~,ܖ\v|?PKx_JݳO/ statacar.dtaUT{Xux dPKR]readstata13/inst/extdata/missings_lsf.dta0000644000176200001440000000256613077075424020211 0ustar liggesusers
117LSF05 Sep 2016 20:27
I >Mjvbins%9.0g%9.0g%9.0g%2s?1
readstata13/inst/extdata/gen_fac.dta0000644000176200001440000000252413077075424017065 0ustar liggesusers
118LSF12 Jul 2016 00:22
<7CfuHTv1%9.0gv1?c`)`)K`)K`)?c?d?c`)`)?cKPO?cKPO L L?d  `)`)K"?c?cKv1K?d`)?d`)K?c`)one
readstata13/inst/extdata/nonint.do0000644000176200001440000000020413077075424016633 0ustar liggesusersclear all set obs 2 gen double v1 = _n recode v1 2 = 1.2 label define v1 1 "one" label values v1 v1 save "nonint.dta", replace readstata13/inst/include/0000755000176200001440000000000013103030636014761 5ustar liggesusersreadstata13/inst/include/read_pre13_dta.h0000644000176200001440000000155413103030636017714 0ustar liggesusers/* * Copyright (C) 2015 Jan Marvin Garbuszus and Sebastian Jeworutzki * * This program is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the * Free Software Foundation; either version 2 of the License, or (at your * option) any later version. * * This program is distributed in the hope that it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for * more details. * * You should have received a copy of the GNU General Public License along * with this program. If not, see . */ #ifndef READPRE13DTA_H #define READPRE13DTA_H Rcpp::List read_pre13_dta(FILE * file, const bool missing, const Rcpp::IntegerVector selectrows); #endif readstata13/inst/include/readstata.h0000644000176200001440000000617513077075424017131 0ustar liggesusers/* * Copyright (C) 2015 Jan Marvin Garbuszus and Sebastian Jeworutzki * * This program is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the * Free Software Foundation; either version 2 of the License, or (at your * option) any later version. * * This program is distributed in the hope that it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for * more details. * * You should have received a copy of the GNU General Public License along * with this program. If not, see . */ #ifndef READSTATA_H #define READSTATA_H #include #include #include #include #include #define GCC_VERSION (__GNUC__ * 10000 \ + __GNUC_MINOR__ * 100 \ + __GNUC_PATCHLEVEL__) /* Test for GCC < 4.9.0 */ #if GCC_VERSION < 40900 & !__clang__ typedef signed char int8_t; typedef unsigned char uint8_t; typedef signed short int16_t; typedef unsigned short uint16_t; typedef signed int int32_t; typedef unsigned int uint32_t; #else #include #endif #include "read_dta.h" #include "read_pre13_dta.h" #include "statadefines.h" #include "swap_endian.h" template T readbin( T t , FILE * file, bool swapit) { if (fread(&t, sizeof(t), 1, file) != 1) { if (feof(file)) return 0; // this is expected after reading the labeltable } else if (ferror(file)){ Rcpp::warning("num: a binary read error occurred."); } if (swapit==0) return(t); else return(swap_endian(t)); } template T readuint48( T t , FILE * file, bool swapit) { char uint48[6]; if (fread(uint48, sizeof(uint48), 1, file) != 1) { if (feof(file)) return 0; // this is expected after reading the labeltable } else if (ferror(file)){ Rcpp::warning("num: a binary read error occurred."); } t = *(uint64_t *)&uint48; if (swapit==0) return(t); else return(swap_endian(t)); } static void readstring(std::string &mystring, FILE * fp, int nchar) { if (!fread(&mystring[0], nchar, 1, fp)) Rcpp::warning("char: a binary read error occurred"); } inline void test(std::string testme, FILE * file) { std::string test(testme.size(), '\0'); readstring(test,file, test.size()); if (testme.compare(test)!=0) { fclose(file); Rcpp::warning("\n testme:%s \n test: %s\n", testme.c_str(), test.c_str()); Rcpp::stop("When attempting to read %s: Something went wrong!", testme.c_str()); } } template static void writebin(T t, std::fstream& dta, bool swapit) { if (swapit==1){ T t_s = swap_endian(t); dta.write((char*)&t_s, sizeof(t_s)); } else { dta.write((char*)&t, sizeof(t)); } } template static void writestr(std::string val_s, T len, std::fstream& dta) { std::stringstream val_stream; val_stream << std::left << std::setw(len) << std::setfill('\0') << val_s; std::string val_strl = val_stream.str(); dta.write(val_strl.c_str(),val_strl.length()); } #endif readstata13/inst/include/read_dta.h0000644000176200001440000000153413103030636016700 0ustar liggesusers/* * Copyright (C) 2015 Jan Marvin Garbuszus and Sebastian Jeworutzki * * This program is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the * Free Software Foundation; either version 2 of the License, or (at your * option) any later version. * * This program is distributed in the hope that it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for * more details. * * You should have received a copy of the GNU General Public License along * with this program. If not, see . */ #ifndef READDTA_H #define READDTA_H Rcpp::List read_dta(FILE * file, const bool missing, const Rcpp::IntegerVector selectrows); #endif readstata13/inst/include/swap_endian.h0000644000176200001440000000224213077075424017440 0ustar liggesusers#ifndef SWAP_ENDIAN #define SWAP_ENDIAN /*#include */ #include #define GCC_VERSION (__GNUC__ * 10000 \ + __GNUC_MINOR__ * 100 \ + __GNUC_PATCHLEVEL__) /* Test for GCC < 4.8.0 */ #if GCC_VERSION < 40800 & !__clang__ static inline unsigned short __builtin_bswap16(unsigned short a) { return (a<<8)|(a>>8); } #endif template T swap_endian(T t) { if (typeid(T) == typeid(int16_t)) return __builtin_bswap16(t); if (typeid(T) == typeid(uint16_t)) return __builtin_bswap16(t); if (typeid(T) == typeid(int32_t)) return __builtin_bswap32(t); if (typeid(T) == typeid(uint32_t)) return __builtin_bswap32(t); if (typeid(T) == typeid(int64_t)) return __builtin_bswap64(t); if (typeid(T) == typeid(uint64_t)) return __builtin_bswap64(t); union v { double d; float f; uint32_t i32; uint64_t i64; } val; if (typeid(T) == typeid(float)){ val.f = t; val.i32 = __builtin_bswap32(val.i32); return val.f; } if (typeid(T) == typeid(double)){ val.d = t; val.i64 = __builtin_bswap64(val.i64); return val.d; } else return t; } #endif readstata13/inst/include/statadefines.h0000644000176200001440000000436713077075424017634 0ustar liggesusers/* * Copyright (C) 2015 Jan Marvin Garbuszus and Sebastian Jeworutzki * * This program is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the * Free Software Foundation; either version 2 of the License, or (at your * option) any later version. * * This program is distributed in the hope that it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for * more details. * * You should have received a copy of the GNU General Public License along * with this program. If not, see . */ #ifndef STATADEFINES #define STATADEFINES /* Test for a little-endian machine */ #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ #define sbyteorder "LSF" #define SBYTEORDER 2 #else #define sbyteorder "MSF" #define SBYTEORDER 1 #endif #define swapit FALSE /*Define missings*/ #define STATA_BYTE_NA_MIN -127 #define STATA_BYTE_NA_MAX +100 #define STATA_BYTE_NA +101 #define STATA_BYTE_NA_104 +127 // guess. #define STATA_SHORTINT_NA_MIN -32767 #define STATA_SHORTINT_NA_MAX +32740 #define STATA_SHORTINT_NA +32741 #define STATA_INT_NA_MIN -2147483647 #define STATA_INT_NA_MAX +2147483620 #define STATA_INT_NA +2147483621 #define STATA_INT_NA_108 2147483647 #define STATA_FLOAT_NA_MAX (1+15/pow(16.0,1)+15/pow(16.0,2)+15/pow(16.0,3)+15/pow(16.0,4)+15/pow(16.0,5)+14/pow(16.0,6))*pow(2.0,126) #define STATA_FLOAT_NA_MIN -STATA_FLOAT_NA_MAX #define STATA_FLOAT_NA 1+pow(2.0,127) #define STATA_DOUBLE_NA_MAX (1+15/pow(16.0,1)+15/pow(16.0,2)+15/pow(16.0,3)+15/pow(16.0,4)+15/pow(16.0,5)+15/pow(16.0,6)+15/pow(16.0,7)+15/pow(16.0,8)+15/pow(16.0,9)+15/pow(16.0,10)+15/pow(16.0,11)+15/pow(16.0,12)+15/pow(16.0,13))*pow(2.0,1022) #define STATA_DOUBLE_NA_MIN -1*(1+15/pow(16.0,1)+15/pow(16.0,2)+15/pow(16.0,3)+15/pow(16.0,4)+15/pow(16.0,5)+15/pow(16.0,6)+15/pow(16.0,7)+15/pow(16.0,8)+15/pow(16.0,9)+15/pow(16.0,10)+15/pow(16.0,11)+15/pow(16.0,12)+15/pow(16.0,13))*pow(2.0,1023) #define STATA_DOUBLE_NA pow(2.0,1023) #define STATA_BYTE 65530 #define STATA_SHORTINT 65529 #define STATA_INT 65528 #define STATA_FLOAT 65527 #define STATA_DOUBLE 65526 #define STATA_STRL 32768 #endif readstata13/tests/0000755000176200001440000000000013077075424013541 5ustar liggesusersreadstata13/tests/testthat.R0000644000176200001440000000010213077075424015515 0ustar liggesuserslibrary(testthat) library(readstata13) test_check("readstata13") readstata13/tests/testthat/0000755000176200001440000000000013103134150015357 5ustar liggesusersreadstata13/tests/testthat/test_read.R0000644000176200001440000001133613077075424017502 0ustar liggesuserslibrary(readstata13) context("Reading datasets") datacompare <- function(x, y) { res <- unlist(Map(all.equal, x, y)) # with all(unlist(res)) if not TRUE, a warning is thrown res <- all(unlist(lapply(res, isTRUE))) res } #### missings #### # missings.do creates missings.dta # missings.dta contains variable missings containing ., .a, .b, ..., .z # # Note: prior Stata 8 there was only a single missing value dd <- data.frame(missings = as.numeric(rep(NA, 27))) missings <- system.file("extdata", "missings.dta", package="readstata13") dd118 <- read.dta13(missings, missing.type = FALSE) dd118_m <- read.dta13(missings, missing.type = TRUE) mvals <- attr(dd118_m, "missing")$missings test_that("missings", { expect_true(datacompare(dd, dd118)) expect_true(datacompare(dd, dd118_m)) expect_identical(mvals, as.numeric(0:26)) }) # rm(list = files) #### missings msf/lsf #### dd <- data.frame(b = as.logical(c(1,NA)), i=as.integer(c(1,NA)), n=as.numeric(c(1,NA)), s=c("1", ""), stringsAsFactors = FALSE) dd$b <- as.integer(dd$b) missings_msf <- system.file("extdata", "missings_msf.dta", package="readstata13") missings_lsf <- system.file("extdata", "missings_lsf.dta", package="readstata13") dd_msf <- read.dta13(missings_msf) dd_lsf <- read.dta13(missings_lsf) test_that("missings msf/lsf", { expect_true(datacompare(dd, dd_msf)) expect_true(datacompare(dd, dd_lsf)) }) #### generate factors TRUE #### dd <- data.frame(v1 = as.numeric(1:2)) dd$v1 <- factor(x = dd$v1, levels = 1:2, labels = c("one", "2")) gen_fac <- system.file("extdata", "gen_fac.dta", package="readstata13") dd118 <- read.dta13(gen_fac, convert.factors = TRUE, generate.factors = TRUE) test_that("generate.factors TRUE", { expect_true(datacompare(dd, dd118)) }) # rm(list = files) #### generate factors FALSE #### dd <- data.frame(v1 = as.numeric(1:2)) gen_fac <- system.file("extdata", "gen_fac.dta", package="readstata13") suppressWarnings(dd118 <- read.dta13(gen_fac, convert.factors = TRUE, generate.factors = FALSE)) test_that("generate.factors TRUE", { expect_true(datacompare(dd, dd118)) }) #### convert.underscore = TRUE #### dd <- data.frame(v.1 = as.numeric(1:2), v.2 = as.numeric(1:2), long.name.multiple.underscores = as.numeric(1:2)) underscore <- system.file("extdata", "underscore.dta", package="readstata13") dd118 <- read.dta13(underscore, convert.underscore = T) test_that("generate.factors TRUE", { expect_true(datacompare(dd, dd118)) }) #### convert.underscore = FALSE #### dd <- data.frame(v.1 = as.numeric(1:2), v.2 = as.numeric(1:2), long_name_multiple_underscores = as.numeric(1:2)) underscore <- system.file("extdata", "underscore.dta", package="readstata13") dd118 <- read.dta13(underscore, convert.underscore = F) test_that("generate.factors TRUE", { expect_true(datacompare(dd, dd118)) }) #### noint.factors TRUE #### dd <- data.frame(v1 = as.numeric(1:2)) dd$v1 <- factor(x = dd$v1, levels = 1:2, labels = c("one", "1.2")) nonint <- system.file("extdata", "nonint.dta", package="readstata13") dd118 <- read.dta13(nonint, convert.factors = TRUE, generate.factors = TRUE, nonint.factors = TRUE) test_that("nonint.factors TRUE", { expect_true(datacompare(dd, dd118)) }) # rm(list = files) #### encoding TRUE #### umlauts <- c("ä","ö","ü","ß","€","Œ") Encoding(umlauts) <- "UTF-8" ddcp <- dd <- data.frame(num = factor(1:6, levels = 1:6, labels = umlauts), chr = umlauts, stringsAsFactors = FALSE) # Dataset in CP1252 levels(ddcp$num)[5:6] <- c("EUR","OE") ddcp$chr[5:6] <- c("EUR","OE") # Stata 14 encode <- system.file("extdata", "encode.dta", package="readstata13") # Stata 12 encodecp <- system.file("extdata", "encodecp.dta", package="readstata13") ddutf_aE <- read.dta13(encode, convert.factors = TRUE, generate.factors = TRUE, encoding="UTF-8") # On windows the last two characters will fail on default (not in latin1) dd_aE <- read.dta13(encode, convert.factors = TRUE, generate.factors = TRUE) ddcp_aE <- read.dta13(encodecp, convert.factors = TRUE, generate.factors = TRUE) test_that("encoding CP1252", { expect_true(datacompare(ddcp, ddcp_aE)) }) test_that("encoding UTF-8 (Stata 14)", { expect_true(datacompare(dd$chr[1:4], dd_aE$chr[1:4])) expect_true(datacompare(dd, ddutf_aE)) }) test_that("Reading of strls", { strl <- system.file("extdata", "statacar.dta", package="readstata13") ddstrlf <- read.dta13(strl, replace.strl = F) ddstrlfref <- paste0("11_", 1:8) expect_equal(ddstrlf$modelStrL, ddstrlfref) ddstrl <- read.dta13(strl, replace.strl = T) expect_equal(ddstrl$model, ddstrl$modelStrL) }) readstata13/tests/testthat/test_save.R0000644000176200001440000007322613103030636017515 0ustar liggesuserslibrary(readstata13) context("Saving datasets") # ToDo: Fix this. # load(system.file("extdata/statacar.RData", package="readstata13")) # # saveandload <- function(x, ...) { # file <- tempfile(pattern="readstata13_", fileext=".dta") # save.dta13(x, file=file, ...) # all(unlist(Map(identical, x, read.dta13(file)))) # } # # test_that("Saved file is identical: Version 118", { # expect_true(saveandload(statacar, version="118", convert.factors=T)) # }) datacompare <- function(x, y) { all(unlist(Map(all.equal, x, y))) } namescompare <- function(x, y){ all(identical(names(x), names(y))) } files <- c("dd118", "dd117", "dd115", "dd114", "dd113", "dd112", "dd111", "dd110", "dd108", "dd107", "dd106", "dd105", "dd104", "dd103", "dd102", "dd") data(mtcars) #### version #### if (readstata13:::dir.exists13("data")) unlink("data", recursive = TRUE) dir.create("data") dd <- mtcars save.dta13(dd, "data/dta_118.dta", version = 118) save.dta13(dd, "data/dta_117.dta", version = 117) save.dta13(dd, "data/dta_115.dta", version = 115) save.dta13(dd, "data/dta_114.dta", version = 114) save.dta13(dd, "data/dta_113.dta", version = 113) save.dta13(dd, "data/dta_112.dta", version = 112) save.dta13(dd, "data/dta_111.dta", version = 111) save.dta13(dd, "data/dta_110.dta", version = 110) save.dta13(dd, "data/dta_108.dta", version = 108) save.dta13(dd, "data/dta_107.dta", version = 107) save.dta13(dd, "data/dta_106.dta", version = 106) save.dta13(dd, "data/dta_105.dta", version = 105) save.dta13(dd, "data/dta_104.dta", version = 104) save.dta13(dd, "data/dta_103.dta", version = 103) save.dta13(dd, "data/dta_102.dta", version = 102) dd118 <- read.dta13("data/dta_118.dta") dd117 <- read.dta13("data/dta_117.dta") dd115 <- read.dta13("data/dta_115.dta") dd114 <- read.dta13("data/dta_114.dta") dd113 <- read.dta13("data/dta_113.dta") dd112 <- read.dta13("data/dta_112.dta") dd111 <- read.dta13("data/dta_111.dta") dd110 <- read.dta13("data/dta_110.dta") dd108 <- read.dta13("data/dta_108.dta") dd107 <- read.dta13("data/dta_107.dta") dd106 <- read.dta13("data/dta_106.dta") dd105 <- read.dta13("data/dta_105.dta") dd104 <- read.dta13("data/dta_104.dta") dd103 <- read.dta13("data/dta_103.dta") dd102 <- read.dta13("data/dta_102.dta") # rm -r unlink("data", recursive = TRUE) test_that("version", { expect_true(datacompare(dd, dd118)) expect_true(datacompare(dd, dd117)) expect_true(datacompare(dd, dd115)) expect_true(datacompare(dd, dd114)) expect_true(datacompare(dd, dd113)) expect_true(datacompare(dd, dd112)) expect_true(datacompare(dd, dd111)) expect_true(datacompare(dd, dd110)) expect_true(datacompare(dd, dd108)) expect_true(datacompare(dd, dd107)) expect_true(datacompare(dd, dd106)) expect_true(datacompare(dd, dd105)) expect_true(datacompare(dd, dd104)) expect_true(datacompare(dd, dd103)) expect_true(datacompare(dd, dd102)) }) # rm(list = files) #### compress #### if (readstata13:::dir.exists13("data")) unlink("data", recursive = TRUE) dir.create("data") dd <- mtcars save.dta13(dd, "data/dta_118.dta", version = 118, compress = TRUE) save.dta13(dd, "data/dta_117.dta", version = 117, compress = TRUE) save.dta13(dd, "data/dta_115.dta", version = 115, compress = TRUE) save.dta13(dd, "data/dta_114.dta", version = 114, compress = TRUE) save.dta13(dd, "data/dta_113.dta", version = 113, compress = TRUE) save.dta13(dd, "data/dta_112.dta", version = 112, compress = TRUE) save.dta13(dd, "data/dta_111.dta", version = 111, compress = TRUE) save.dta13(dd, "data/dta_110.dta", version = 110, compress = TRUE) save.dta13(dd, "data/dta_108.dta", version = 108, compress = TRUE) save.dta13(dd, "data/dta_107.dta", version = 107, compress = TRUE) save.dta13(dd, "data/dta_106.dta", version = 106, compress = TRUE) save.dta13(dd, "data/dta_105.dta", version = 105, compress = TRUE) save.dta13(dd, "data/dta_104.dta", version = 104, compress = TRUE) save.dta13(dd, "data/dta_103.dta", version = 103, compress = TRUE) save.dta13(dd, "data/dta_102.dta", version = 102, compress = TRUE) dd118 <- read.dta13("data/dta_118.dta") dd117 <- read.dta13("data/dta_117.dta") dd115 <- read.dta13("data/dta_115.dta") dd114 <- read.dta13("data/dta_114.dta") dd113 <- read.dta13("data/dta_113.dta") dd112 <- read.dta13("data/dta_112.dta") dd111 <- read.dta13("data/dta_111.dta") dd110 <- read.dta13("data/dta_110.dta") dd108 <- read.dta13("data/dta_108.dta") dd107 <- read.dta13("data/dta_107.dta") dd106 <- read.dta13("data/dta_106.dta") dd105 <- read.dta13("data/dta_105.dta") dd104 <- read.dta13("data/dta_104.dta") dd103 <- read.dta13("data/dta_103.dta") dd102 <- read.dta13("data/dta_102.dta") # rm -r unlink("data", recursive = TRUE) test_that("compress", { expect_true(datacompare(dd, dd118)) expect_true(datacompare(dd, dd117)) expect_true(datacompare(dd, dd115)) expect_true(datacompare(dd, dd114)) expect_true(datacompare(dd, dd113)) expect_true(datacompare(dd, dd112)) expect_true(datacompare(dd, dd111)) expect_true(datacompare(dd, dd110)) expect_true(datacompare(dd, dd108)) expect_true(datacompare(dd, dd107)) expect_true(datacompare(dd, dd106)) expect_true(datacompare(dd, dd105)) expect_true(datacompare(dd, dd104)) expect_true(datacompare(dd, dd103)) expect_true(datacompare(dd, dd102)) }) # rm(list = files) #### convert.factors TRUE #### if (readstata13:::dir.exists13("data")) unlink("data", recursive = TRUE) dir.create("data") dd <- mtcars dd$am <- factor(x = dd$am, levels = c(0,1), labels = c("auto", "man")) save.dta13(dd, "data/dta_118.dta", version = 118, convert.factors = TRUE) save.dta13(dd, "data/dta_117.dta", version = 117, convert.factors = TRUE) save.dta13(dd, "data/dta_115.dta", version = 115, convert.factors = TRUE) save.dta13(dd, "data/dta_114.dta", version = 114, convert.factors = TRUE) save.dta13(dd, "data/dta_113.dta", version = 113, convert.factors = TRUE) save.dta13(dd, "data/dta_112.dta", version = 112, convert.factors = TRUE) save.dta13(dd, "data/dta_111.dta", version = 111, convert.factors = TRUE) save.dta13(dd, "data/dta_110.dta", version = 110, convert.factors = TRUE) save.dta13(dd, "data/dta_108.dta", version = 108, convert.factors = TRUE) save.dta13(dd, "data/dta_107.dta", version = 107, convert.factors = TRUE) # save.dta13(dd, "data/dta_106.dta", version = 106, convert.factors = TRUE) # save.dta13(dd, "data/dta_105.dta", version = 105, convert.factors = TRUE) # save.dta13(dd, "data/dta_104.dta", version = 104, convert.factors = TRUE) # save.dta13(dd, "data/dta_103.dta", version = 103, convert.factors = TRUE) # save.dta13(dd, "data/dta_102.dta", version = 102, convert.factors = TRUE) dd118 <- read.dta13("data/dta_118.dta") dd117 <- read.dta13("data/dta_117.dta") dd115 <- read.dta13("data/dta_115.dta") dd114 <- read.dta13("data/dta_114.dta") dd113 <- read.dta13("data/dta_113.dta") dd112 <- read.dta13("data/dta_112.dta") dd111 <- read.dta13("data/dta_111.dta") dd110 <- read.dta13("data/dta_110.dta") dd108 <- read.dta13("data/dta_108.dta") dd107 <- read.dta13("data/dta_107.dta") # dd106 <- read.dta13("data/dta_106.dta") # dd105 <- read.dta13("data/dta_105.dta") no factors # dd104 <- read.dta13("data/dta_104.dta") # dd103 <- read.dta13("data/dta_103.dta") # dd102 <- read.dta13("data/dta_102.dta") # rm -r unlink("data", recursive = TRUE) test_that("convert.factors TRUE", { expect_true(datacompare(dd, dd118)) expect_true(datacompare(dd, dd117)) expect_true(datacompare(dd, dd115)) expect_true(datacompare(dd, dd114)) expect_true(datacompare(dd, dd113)) expect_true(datacompare(dd, dd112)) expect_true(datacompare(dd, dd111)) expect_true(datacompare(dd, dd110)) expect_true(datacompare(dd, dd108)) expect_true(datacompare(dd, dd107)) # expect_true(datacompare(dd, dd106)) # expect_true(datacompare(dd, dd105)) no factors # expect_true(datacompare(dd, dd104)) # expect_true(datacompare(dd, dd103)) # expect_true(datacompare(dd, dd102)) }) # rm(list = files) #### convert.factors FALSE #### if (readstata13:::dir.exists13("data")) unlink("data", recursive = TRUE) dir.create("data") dd <- mtcars dd$am <- factor(x = dd$am, levels = c(0,1), labels = c("auto", "man")) save.dta13(dd, "data/dta_118.dta", version = 118, convert.factors = FALSE) save.dta13(dd, "data/dta_117.dta", version = 117, convert.factors = FALSE) save.dta13(dd, "data/dta_115.dta", version = 115, convert.factors = FALSE) save.dta13(dd, "data/dta_114.dta", version = 114, convert.factors = FALSE) save.dta13(dd, "data/dta_113.dta", version = 113, convert.factors = FALSE) save.dta13(dd, "data/dta_112.dta", version = 112, convert.factors = FALSE) save.dta13(dd, "data/dta_111.dta", version = 111, convert.factors = FALSE) save.dta13(dd, "data/dta_110.dta", version = 110, convert.factors = FALSE) save.dta13(dd, "data/dta_108.dta", version = 108, convert.factors = FALSE) save.dta13(dd, "data/dta_107.dta", version = 107, convert.factors = FALSE) # save.dta13(dd, "data/dta_106.dta", version = 106, convert.factors = FALSE) # save.dta13(dd, "data/dta_105.dta", version = 105, convert.factors = FALSE) # no factors | expect_warning ? # save.dta13(dd, "data/dta_104.dta", version = 104, convert.factors = FALSE) # save.dta13(dd, "data/dta_103.dta", version = 103, convert.factors = FALSE) # save.dta13(dd, "data/dta_102.dta", version = 102, convert.factors = FALSE) dd118 <- read.dta13("data/dta_118.dta") dd117 <- read.dta13("data/dta_117.dta") dd115 <- read.dta13("data/dta_115.dta") dd114 <- read.dta13("data/dta_114.dta") dd113 <- read.dta13("data/dta_113.dta") dd112 <- read.dta13("data/dta_112.dta") dd111 <- read.dta13("data/dta_111.dta") dd110 <- read.dta13("data/dta_110.dta") dd108 <- read.dta13("data/dta_108.dta") dd107 <- read.dta13("data/dta_107.dta") # dd106 <- read.dta13("data/dta_106.dta") # dd105 <- read.dta13("data/dta_105.dta") no factors | expect_warning ? # dd104 <- read.dta13("data/dta_104.dta") # dd103 <- read.dta13("data/dta_103.dta") # dd102 <- read.dta13("data/dta_102.dta") # add one (because of stupid factor) dd <- mtcars dd$am <- dd$am + 1 # rm -r unlink("data", recursive = TRUE) test_that("convert.factors TRUE", { expect_true(datacompare(dd, dd118)) expect_true(datacompare(dd, dd117)) expect_true(datacompare(dd, dd115)) expect_true(datacompare(dd, dd114)) expect_true(datacompare(dd, dd113)) expect_true(datacompare(dd, dd112)) expect_true(datacompare(dd, dd111)) expect_true(datacompare(dd, dd110)) expect_true(datacompare(dd, dd108)) expect_true(datacompare(dd, dd107)) # expect_true(datacompare(dd, dd106)) # expect_true(datacompare(dd, dd105)) no factors # expect_true(datacompare(dd, dd104)) # expect_true(datacompare(dd, dd103)) # expect_true(datacompare(dd, dd102)) }) # rm(list = files) #### add rownames TRUE #### if (readstata13:::dir.exists13("data")) unlink("data", recursive = TRUE) dir.create("data") dd <- mtcars save.dta13(dd, "data/dta_118.dta", version = 118, add.rownames = TRUE) save.dta13(dd, "data/dta_117.dta", version = 117, add.rownames = TRUE) save.dta13(dd, "data/dta_115.dta", version = 115, add.rownames = TRUE) save.dta13(dd, "data/dta_114.dta", version = 114, add.rownames = TRUE) save.dta13(dd, "data/dta_113.dta", version = 113, add.rownames = TRUE) save.dta13(dd, "data/dta_112.dta", version = 112, add.rownames = TRUE) save.dta13(dd, "data/dta_111.dta", version = 111, add.rownames = TRUE) save.dta13(dd, "data/dta_110.dta", version = 110, add.rownames = TRUE) save.dta13(dd, "data/dta_108.dta", version = 108, add.rownames = TRUE) save.dta13(dd, "data/dta_107.dta", version = 107, add.rownames = TRUE) save.dta13(dd, "data/dta_106.dta", version = 106, add.rownames = TRUE) save.dta13(dd, "data/dta_105.dta", version = 105, add.rownames = TRUE) save.dta13(dd, "data/dta_104.dta", version = 104, add.rownames = TRUE) save.dta13(dd, "data/dta_103.dta", version = 103, add.rownames = TRUE) save.dta13(dd, "data/dta_102.dta", version = 102, add.rownames = TRUE) dd118 <- read.dta13("data/dta_118.dta", add.rownames = TRUE) dd117 <- read.dta13("data/dta_117.dta", add.rownames = TRUE) dd115 <- read.dta13("data/dta_115.dta", add.rownames = TRUE) dd114 <- read.dta13("data/dta_114.dta", add.rownames = TRUE) dd113 <- read.dta13("data/dta_113.dta", add.rownames = TRUE) dd112 <- read.dta13("data/dta_112.dta", add.rownames = TRUE) dd111 <- read.dta13("data/dta_111.dta", add.rownames = TRUE) dd110 <- read.dta13("data/dta_110.dta", add.rownames = TRUE) dd108 <- read.dta13("data/dta_108.dta", add.rownames = TRUE) dd107 <- read.dta13("data/dta_107.dta", add.rownames = TRUE) dd106 <- read.dta13("data/dta_106.dta", add.rownames = TRUE) dd105 <- read.dta13("data/dta_105.dta", add.rownames = TRUE) dd104 <- read.dta13("data/dta_104.dta", add.rownames = TRUE) dd103 <- read.dta13("data/dta_103.dta", add.rownames = TRUE) dd102 <- read.dta13("data/dta_102.dta", add.rownames = TRUE) # rm -r unlink("data", recursive = TRUE) test_that("add.rownames TRUE", { # Check that rownames are identical expect_true(identical(rownames(dd), rownames(dd118))) expect_true(identical(rownames(dd), rownames(dd117))) expect_true(identical(rownames(dd), rownames(dd115))) expect_true(identical(rownames(dd), rownames(dd114))) expect_true(identical(rownames(dd), rownames(dd113))) expect_true(identical(rownames(dd), rownames(dd112))) expect_true(identical(rownames(dd), rownames(dd111))) expect_true(identical(rownames(dd), rownames(dd110))) expect_true(identical(rownames(dd), rownames(dd108))) expect_true(identical(rownames(dd), rownames(dd107))) expect_true(identical(rownames(dd), rownames(dd106))) expect_true(identical(rownames(dd), rownames(dd105))) expect_true(identical(rownames(dd), rownames(dd104))) expect_true(identical(rownames(dd), rownames(dd103))) expect_true(identical(rownames(dd), rownames(dd102))) # Check that data is identical expect_true(datacompare(dd, dd118)) expect_true(datacompare(dd, dd117)) expect_true(datacompare(dd, dd115)) expect_true(datacompare(dd, dd114)) expect_true(datacompare(dd, dd113)) expect_true(datacompare(dd, dd112)) expect_true(datacompare(dd, dd111)) expect_true(datacompare(dd, dd110)) expect_true(datacompare(dd, dd108)) expect_true(datacompare(dd, dd107)) expect_true(datacompare(dd, dd106)) expect_true(datacompare(dd, dd105)) expect_true(datacompare(dd, dd104)) expect_true(datacompare(dd, dd103)) expect_true(datacompare(dd, dd102)) }) # rm(list = files) #### data label TRUE #### dl <- "mtcars data file" if (readstata13:::dir.exists13("data")) unlink("data", recursive = TRUE) dir.create("data") dd <- mtcars save.dta13(dd, "data/dta_118.dta", version = 118, data.label = dl) save.dta13(dd, "data/dta_117.dta", version = 117, data.label = dl) save.dta13(dd, "data/dta_115.dta", version = 115, data.label = dl) save.dta13(dd, "data/dta_114.dta", version = 114, data.label = dl) save.dta13(dd, "data/dta_113.dta", version = 113, data.label = dl) save.dta13(dd, "data/dta_112.dta", version = 112, data.label = dl) save.dta13(dd, "data/dta_111.dta", version = 111, data.label = dl) save.dta13(dd, "data/dta_110.dta", version = 110, data.label = dl) save.dta13(dd, "data/dta_108.dta", version = 108, data.label = dl) save.dta13(dd, "data/dta_107.dta", version = 107, data.label = dl) save.dta13(dd, "data/dta_106.dta", version = 106, data.label = dl) save.dta13(dd, "data/dta_105.dta", version = 105, data.label = dl) save.dta13(dd, "data/dta_104.dta", version = 104, data.label = dl) save.dta13(dd, "data/dta_103.dta", version = 103, data.label = dl) # save.dta13(dd, "data/dta_102.dta", version = 102, data.label = dl) # no data label dd118 <- read.dta13("data/dta_118.dta") dd117 <- read.dta13("data/dta_117.dta") dd115 <- read.dta13("data/dta_115.dta") dd114 <- read.dta13("data/dta_114.dta") dd113 <- read.dta13("data/dta_113.dta") dd112 <- read.dta13("data/dta_112.dta") dd111 <- read.dta13("data/dta_111.dta") dd110 <- read.dta13("data/dta_110.dta") dd108 <- read.dta13("data/dta_108.dta") dd107 <- read.dta13("data/dta_107.dta") dd106 <- read.dta13("data/dta_106.dta") dd105 <- read.dta13("data/dta_105.dta") dd104 <- read.dta13("data/dta_104.dta") dd103 <- read.dta13("data/dta_103.dta") # dd102 <- read.dta13("data/dta_102.dta") unlink("data", recursive = TRUE) test_that("data label", { # Check that rownames are identical expect_equal(dl, attr(dd118, "datalabel")) expect_equal(dl, attr(dd117, "datalabel")) expect_equal(dl, attr(dd115, "datalabel")) expect_equal(dl, attr(dd114, "datalabel")) expect_equal(dl, attr(dd113, "datalabel")) expect_equal(dl, attr(dd112, "datalabel")) expect_equal(dl, attr(dd111, "datalabel")) expect_equal(dl, attr(dd110, "datalabel")) expect_equal(dl, attr(dd108, "datalabel")) expect_equal(dl, attr(dd107, "datalabel")) expect_equal(dl, attr(dd106, "datalabel")) expect_equal(dl, attr(dd105, "datalabel")) expect_equal(dl, attr(dd104, "datalabel")) expect_equal(dl, attr(dd103, "datalabel")) # expect_equal(dl, attr(dd102, "datalabel")) }) # rm(list = files) #### convert dates TRUE #### if (readstata13:::dir.exists13("data")) unlink("data", recursive = TRUE) dir.create("data") dd <- data.frame( dat = Sys.Date() ) save.dta13(dd, "data/dta_118.dta", version = 118, convert.dates = TRUE) save.dta13(dd, "data/dta_117.dta", version = 117, convert.dates = TRUE) save.dta13(dd, "data/dta_115.dta", version = 115, convert.dates = TRUE) save.dta13(dd, "data/dta_114.dta", version = 114, convert.dates = TRUE) save.dta13(dd, "data/dta_113.dta", version = 113, convert.dates = TRUE) save.dta13(dd, "data/dta_112.dta", version = 112, convert.dates = TRUE) save.dta13(dd, "data/dta_111.dta", version = 111, convert.dates = TRUE) save.dta13(dd, "data/dta_110.dta", version = 110, convert.dates = TRUE) save.dta13(dd, "data/dta_108.dta", version = 108, convert.dates = TRUE) save.dta13(dd, "data/dta_107.dta", version = 107, convert.dates = TRUE) save.dta13(dd, "data/dta_106.dta", version = 106, convert.dates = TRUE) save.dta13(dd, "data/dta_105.dta", version = 105, convert.dates = TRUE) save.dta13(dd, "data/dta_104.dta", version = 104, convert.dates = TRUE) save.dta13(dd, "data/dta_103.dta", version = 103, convert.dates = TRUE) save.dta13(dd, "data/dta_102.dta", version = 102, convert.dates = TRUE) dd118 <- read.dta13("data/dta_118.dta") dd117 <- read.dta13("data/dta_117.dta") dd115 <- read.dta13("data/dta_115.dta") dd114 <- read.dta13("data/dta_114.dta") dd113 <- read.dta13("data/dta_113.dta") dd112 <- read.dta13("data/dta_112.dta") dd111 <- read.dta13("data/dta_111.dta") dd110 <- read.dta13("data/dta_110.dta") dd108 <- read.dta13("data/dta_108.dta") dd107 <- read.dta13("data/dta_107.dta") dd106 <- read.dta13("data/dta_106.dta") dd105 <- read.dta13("data/dta_105.dta") dd104 <- read.dta13("data/dta_104.dta") dd103 <- read.dta13("data/dta_103.dta") dd102 <- read.dta13("data/dta_102.dta") unlink("data", recursive = TRUE) test_that("convert.dates TRUE", { # Check that rownames are identical expect_true(datacompare(dd, dd118)) expect_true(datacompare(dd, dd117)) expect_true(datacompare(dd, dd115)) expect_true(datacompare(dd, dd114)) expect_true(datacompare(dd, dd113)) expect_true(datacompare(dd, dd112)) expect_true(datacompare(dd, dd111)) expect_true(datacompare(dd, dd110)) expect_true(datacompare(dd, dd108)) expect_true(datacompare(dd, dd107)) expect_true(datacompare(dd, dd106)) expect_true(datacompare(dd, dd105)) expect_true(datacompare(dd, dd104)) expect_true(datacompare(dd, dd103)) expect_true(datacompare(dd, dd102)) }) # rm(list = files) #### strl save #### if (readstata13:::dir.exists13("data")) unlink("data", recursive = TRUE) dir.create("data") # strLs can be of length any length up to 2 billion characters. Starting with # 2046 a string is handled as a strL dd <- data.frame( dat = c(paste(replicate(2046, "a"), collapse = ""), paste(replicate(2046, "b"), collapse = "")), stringsAsFactors = FALSE) save.dta13(dd, "data/dta_118.dta", version = 118) save.dta13(dd, "data/dta_117.dta", version = 117) # save.dta13(dd, "data/dta_115.dta", version = 115) # no strl # save.dta13(dd, "data/dta_114.dta", version = 114) # save.dta13(dd, "data/dta_113.dta", version = 113) # save.dta13(dd, "data/dta_112.dta", version = 112) # save.dta13(dd, "data/dta_111.dta", version = 111) # save.dta13(dd, "data/dta_110.dta", version = 110) # save.dta13(dd, "data/dta_108.dta", version = 108) # save.dta13(dd, "data/dta_107.dta", version = 107) # save.dta13(dd, "data/dta_106.dta", version = 106) # save.dta13(dd, "data/dta_105.dta", version = 105) # save.dta13(dd, "data/dta_104.dta", version = 104) # save.dta13(dd, "data/dta_103.dta", version = 103) # save.dta13(dd, "data/dta_102.dta", version = 102) dd118 <- read.dta13("data/dta_118.dta", replace.strl = TRUE) dd117 <- read.dta13("data/dta_117.dta", replace.strl = TRUE) # dd115 <- read.dta13("data/dta_115.dta") # dd114 <- read.dta13("data/dta_114.dta") # dd113 <- read.dta13("data/dta_113.dta") # dd112 <- read.dta13("data/dta_112.dta") # dd111 <- read.dta13("data/dta_111.dta") # dd110 <- read.dta13("data/dta_110.dta") # dd108 <- read.dta13("data/dta_108.dta") # dd107 <- read.dta13("data/dta_107.dta") # dd106 <- read.dta13("data/dta_106.dta") # dd105 <- read.dta13("data/dta_105.dta") # dd104 <- read.dta13("data/dta_104.dta") # dd103 <- read.dta13("data/dta_103.dta") # dd102 <- read.dta13("data/dta_102.dta") unlink("data", recursive = TRUE) test_that("replace.strl TRUE", { # Check that rownames are identical expect_true(datacompare(dd, dd118)) expect_true(datacompare(dd, dd117)) # expect_true(datacompare(dd, dd115)) # expect_true(datacompare(dd, dd114)) # expect_true(datacompare(dd, dd113)) # expect_true(datacompare(dd, dd112)) # expect_true(datacompare(dd, dd111)) # expect_true(datacompare(dd, dd110)) # expect_true(datacompare(dd, dd108)) # expect_true(datacompare(dd, dd107)) # expect_true(datacompare(dd, dd106)) # expect_true(datacompare(dd, dd105)) # expect_true(datacompare(dd, dd104)) # expect_true(datacompare(dd, dd103)) # expect_true(datacompare(dd, dd102)) }) # rm(list = files) #### convert.underscore save #### if (readstata13:::dir.exists13("data")) unlink("data", recursive = TRUE) dir.create("data") dd <- data.frame(x.1 = 1) save.dta13(dd, "data/dta_118.dta", version = 118, convert.underscore = TRUE) save.dta13(dd, "data/dta_117.dta", version = 117, convert.underscore = TRUE) save.dta13(dd, "data/dta_115.dta", version = 115, convert.underscore = TRUE) save.dta13(dd, "data/dta_114.dta", version = 114, convert.underscore = TRUE) save.dta13(dd, "data/dta_113.dta", version = 113, convert.underscore = TRUE) save.dta13(dd, "data/dta_112.dta", version = 112, convert.underscore = TRUE) save.dta13(dd, "data/dta_111.dta", version = 111, convert.underscore = TRUE) save.dta13(dd, "data/dta_110.dta", version = 110, convert.underscore = TRUE) save.dta13(dd, "data/dta_108.dta", version = 108, convert.underscore = TRUE) save.dta13(dd, "data/dta_107.dta", version = 107, convert.underscore = TRUE) save.dta13(dd, "data/dta_106.dta", version = 106, convert.underscore = TRUE) save.dta13(dd, "data/dta_105.dta", version = 105, convert.underscore = TRUE) save.dta13(dd, "data/dta_104.dta", version = 104, convert.underscore = TRUE) save.dta13(dd, "data/dta_103.dta", version = 103, convert.underscore = TRUE) save.dta13(dd, "data/dta_102.dta", version = 102, convert.underscore = TRUE) dd118 <- read.dta13("data/dta_118.dta") dd117 <- read.dta13("data/dta_117.dta") dd115 <- read.dta13("data/dta_115.dta") dd114 <- read.dta13("data/dta_114.dta") dd113 <- read.dta13("data/dta_113.dta") dd112 <- read.dta13("data/dta_112.dta") dd111 <- read.dta13("data/dta_111.dta") dd110 <- read.dta13("data/dta_110.dta") dd108 <- read.dta13("data/dta_108.dta") dd107 <- read.dta13("data/dta_107.dta") dd106 <- read.dta13("data/dta_106.dta") dd105 <- read.dta13("data/dta_105.dta") dd104 <- read.dta13("data/dta_104.dta") dd103 <- read.dta13("data/dta_103.dta") dd102 <- read.dta13("data/dta_102.dta") unlink("data", recursive = TRUE) names(dd) <- "x_1" test_that("convert.underscore TRUE", { # check numerics expect_true(datacompare(dd, dd118)) expect_true(datacompare(dd, dd117)) expect_true(datacompare(dd, dd115)) expect_true(datacompare(dd, dd114)) expect_true(datacompare(dd, dd113)) expect_true(datacompare(dd, dd112)) expect_true(datacompare(dd, dd111)) expect_true(datacompare(dd, dd110)) expect_true(datacompare(dd, dd108)) expect_true(datacompare(dd, dd107)) expect_true(datacompare(dd, dd106)) expect_true(datacompare(dd, dd105)) expect_true(datacompare(dd, dd104)) expect_true(datacompare(dd, dd103)) expect_true(datacompare(dd, dd102)) # check names expect_true(namescompare(dd, dd118)) expect_true(namescompare(dd, dd117)) expect_true(namescompare(dd, dd115)) expect_true(namescompare(dd, dd114)) expect_true(namescompare(dd, dd113)) expect_true(namescompare(dd, dd112)) expect_true(namescompare(dd, dd111)) expect_true(namescompare(dd, dd110)) expect_true(namescompare(dd, dd108)) expect_true(namescompare(dd, dd107)) expect_true(namescompare(dd, dd106)) expect_true(namescompare(dd, dd105)) expect_true(namescompare(dd, dd104)) expect_true(namescompare(dd, dd103)) expect_true(namescompare(dd, dd102)) }) # rm(list = files) #### select.rows #### if (readstata13:::dir.exists13("data")) unlink("data", recursive = TRUE) dir.create("data") dd <- mtcars save.dta13(dd, "data/dta_118.dta", version = 118) save.dta13(dd, "data/dta_117.dta", version = 117) save.dta13(dd, "data/dta_115.dta", version = 115) save.dta13(dd, "data/dta_114.dta", version = 114) save.dta13(dd, "data/dta_113.dta", version = 113) save.dta13(dd, "data/dta_112.dta", version = 112) save.dta13(dd, "data/dta_111.dta", version = 111) save.dta13(dd, "data/dta_110.dta", version = 110) save.dta13(dd, "data/dta_108.dta", version = 108) save.dta13(dd, "data/dta_107.dta", version = 107) save.dta13(dd, "data/dta_106.dta", version = 106) save.dta13(dd, "data/dta_105.dta", version = 105) save.dta13(dd, "data/dta_104.dta", version = 104) save.dta13(dd, "data/dta_103.dta", version = 103) save.dta13(dd, "data/dta_102.dta", version = 102) dd118 <- read.dta13("data/dta_118.dta", select.rows = 5) dd117 <- read.dta13("data/dta_117.dta", select.rows = 5) dd115 <- read.dta13("data/dta_115.dta", select.rows = 5) dd114 <- read.dta13("data/dta_114.dta", select.rows = 5) dd113 <- read.dta13("data/dta_113.dta", select.rows = 5) dd112 <- read.dta13("data/dta_112.dta", select.rows = 5) dd111 <- read.dta13("data/dta_111.dta", select.rows = 5) dd110 <- read.dta13("data/dta_110.dta", select.rows = 5) dd108 <- read.dta13("data/dta_108.dta", select.rows = 5) dd107 <- read.dta13("data/dta_107.dta", select.rows = 5) dd106 <- read.dta13("data/dta_106.dta", select.rows = 5) dd105 <- read.dta13("data/dta_105.dta", select.rows = 5) dd104 <- read.dta13("data/dta_104.dta", select.rows = 5) dd103 <- read.dta13("data/dta_103.dta", select.rows = 5) dd102 <- read.dta13("data/dta_102.dta", select.rows = 5) unlink("data", recursive = TRUE) dd <- dd[1:5,] test_that("select.rows = 5", { # check numerics expect_true(datacompare(dd, dd118)) expect_true(datacompare(dd, dd117)) expect_true(datacompare(dd, dd115)) expect_true(datacompare(dd, dd114)) expect_true(datacompare(dd, dd113)) expect_true(datacompare(dd, dd112)) expect_true(datacompare(dd, dd111)) expect_true(datacompare(dd, dd110)) expect_true(datacompare(dd, dd108)) expect_true(datacompare(dd, dd107)) expect_true(datacompare(dd, dd106)) expect_true(datacompare(dd, dd105)) expect_true(datacompare(dd, dd104)) expect_true(datacompare(dd, dd103)) expect_true(datacompare(dd, dd102)) }) if (readstata13:::dir.exists13("data")) unlink("data", recursive = TRUE) dir.create("data") dd <- mtcars save.dta13(dd, "data/dta_118.dta", version = 118) save.dta13(dd, "data/dta_117.dta", version = 117) save.dta13(dd, "data/dta_115.dta", version = 115) save.dta13(dd, "data/dta_114.dta", version = 114) save.dta13(dd, "data/dta_113.dta", version = 113) save.dta13(dd, "data/dta_112.dta", version = 112) save.dta13(dd, "data/dta_111.dta", version = 111) save.dta13(dd, "data/dta_110.dta", version = 110) save.dta13(dd, "data/dta_108.dta", version = 108) save.dta13(dd, "data/dta_107.dta", version = 107) save.dta13(dd, "data/dta_106.dta", version = 106) save.dta13(dd, "data/dta_105.dta", version = 105) save.dta13(dd, "data/dta_104.dta", version = 104) save.dta13(dd, "data/dta_103.dta", version = 103) save.dta13(dd, "data/dta_102.dta", version = 102) dd118 <- read.dta13("data/dta_118.dta", select.rows = c(5,10)) dd117 <- read.dta13("data/dta_117.dta", select.rows = c(5,10)) dd115 <- read.dta13("data/dta_115.dta", select.rows = c(5,10)) dd114 <- read.dta13("data/dta_114.dta", select.rows = c(5,10)) dd113 <- read.dta13("data/dta_113.dta", select.rows = c(5,10)) dd112 <- read.dta13("data/dta_112.dta", select.rows = c(5,10)) dd111 <- read.dta13("data/dta_111.dta", select.rows = c(5,10)) dd110 <- read.dta13("data/dta_110.dta", select.rows = c(5,10)) dd108 <- read.dta13("data/dta_108.dta", select.rows = c(5,10)) dd107 <- read.dta13("data/dta_107.dta", select.rows = c(5,10)) dd106 <- read.dta13("data/dta_106.dta", select.rows = c(5,10)) dd105 <- read.dta13("data/dta_105.dta", select.rows = c(5,10)) dd104 <- read.dta13("data/dta_104.dta", select.rows = c(5,10)) dd103 <- read.dta13("data/dta_103.dta", select.rows = c(5,10)) dd102 <- read.dta13("data/dta_102.dta", select.rows = c(5,10)) unlink("data", recursive = TRUE) dd <- dd[5:10,] test_that("select.rows = c(5,10)", { # check numerics expect_true(datacompare(dd, dd118)) expect_true(datacompare(dd, dd117)) expect_true(datacompare(dd, dd115)) expect_true(datacompare(dd, dd114)) expect_true(datacompare(dd, dd113)) expect_true(datacompare(dd, dd112)) expect_true(datacompare(dd, dd111)) expect_true(datacompare(dd, dd110)) expect_true(datacompare(dd, dd108)) expect_true(datacompare(dd, dd107)) expect_true(datacompare(dd, dd106)) expect_true(datacompare(dd, dd105)) expect_true(datacompare(dd, dd104)) expect_true(datacompare(dd, dd103)) expect_true(datacompare(dd, dd102)) }) # rm(list = files) readstata13/src/0000755000176200001440000000000013103052164013150 5ustar liggesusersreadstata13/src/rcpp_readstata.cpp0000644000176200001440000000335113103052164016652 0ustar liggesusers/* * Copyright (C) 2014-2015 Jan Marvin Garbuszus and Sebastian Jeworutzki * * This program is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the * Free Software Foundation; either version 2 of the License, or (at your * option) any later version. * * This program is distributed in the hope that it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for * more details. * * You should have received a copy of the GNU General Public License along * with this program. If not, see . */ #include using namespace Rcpp; // Reads the binary Stata file // // @param filePath The full systempath to the dta file you want to import. // @param missing logical if missings should be converted outside of Rcpp. // @import Rcpp // @export // [[Rcpp::export]] List stata_read(const char * filePath, const bool missing, const IntegerVector selectrows) { FILE *file = NULL; // File pointer /* * Open the file in binary mode using the "rb" format string * This also checks if the file exists and/or can be opened for reading * correctly */ if ((file = fopen(filePath, "rb")) == NULL) throw std::range_error("Could not open specified file."); /* * check the first byte. */ std::string fbit(1, '\0'); readstring(fbit, file, fbit.size()); std::string expfbit = "<"; // create df List df(0); if (fbit.compare(expfbit) == 0) df = read_dta(file, missing, selectrows); else df = read_pre13_dta(file, missing, selectrows); fclose(file); return df; } readstata13/src/Makevars0000644000176200001440000000016113103052164014642 0ustar liggesusers## -*- mode: makefile; -*- PKG_CPPFLAGS = -I../inst/include -I. PKG_LIBS = $(LAPACK_LIBS) $(BLAS_LIBS) $(FLIBS) readstata13/src/register.c0000644000176200001440000000145313103052164015143 0ustar liggesusers#include #include #include // for NULL #include /* FIXME: * Check these declarations against the C/Fortran source code. * */ /* .Call calls */ extern SEXP readstata13_stata_pre13_save(SEXP, SEXP); extern SEXP readstata13_stata_read(SEXP, SEXP); extern SEXP readstata13_stata_save(SEXP, SEXP); static const R_CallMethodDef CallEntries[] = { {"readstata13_stata_pre13_save", (DL_FUNC) &readstata13_stata_pre13_save, 2}, {"readstata13_stata_read", (DL_FUNC) &readstata13_stata_read, 3}, {"readstata13_stata_save", (DL_FUNC) &readstata13_stata_save, 2}, {NULL, NULL, 0} }; void R_init_readstata13(DllInfo *dll) { R_registerRoutines(dll, NULL, CallEntries, NULL, NULL); R_useDynamicSymbols(dll, FALSE); } readstata13/src/rcpp_savestata.cpp0000644000176200001440000004165113103052164016702 0ustar liggesusers/* * Copyright (C) 2014-2015 Jan Marvin Garbuszus and Sebastian Jeworutzki * * This program is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the * Free Software Foundation; either version 2 of the License, or (at your * option) any later version. * * This program is distributed in the hope that it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for * more details. * * You should have received a copy of the GNU General Public License along * with this program. If not, see . */ #include using namespace Rcpp; using namespace std; // Writes the binary Stata file // // @param filePath The full systempath to the dta file you want to export. // @param dat an R-Object of class data.frame. // @export // [[Rcpp::export]] int stata_save(const char * filePath, Rcpp::DataFrame dat) { uint16_t k = dat.size(); uint64_t n = dat.nrows(); const string timestamp = dat.attr("timestamp"); string datalabel = dat.attr("datalabel"); datalabel[datalabel.size()] = '\0'; CharacterVector valLabels = dat.attr("vallabels"); CharacterVector nvarnames = dat.attr("names"); List chs = dat.attr("expansion.fields"); List formats = dat.attr("formats"); List labeltable = dat.attr("label.table"); List varLabels = dat.attr("var.labels"); List vartypes = dat.attr("types"); const string version = dat.attr("version"); uint8_t const release = atoi(version.c_str()); uint8_t nformatslen = 0, ntimestamp = 0; uint16_t nvarnameslen = 0, nvarLabelslen = 0, nvalLabelslen = 0, ndlabel = 0, lbllen = 0; uint32_t chlen = 0, maxdatalabelsize = 0, maxlabelsize = 32000; switch (release) { case 117: nvarnameslen = 33; nformatslen = 49; nvalLabelslen = 33; nvarLabelslen = 81; maxdatalabelsize = 80; chlen = 33; lbllen = 33; break; case 118: nvarnameslen = 129; nformatslen = 57; nvalLabelslen = 129; nvarLabelslen = 321; maxdatalabelsize = 320; // in utf8 4 * 80 byte chlen = 129; lbllen = 129; break; } const string head = "
"; const string byteord = ""; const string K = ""; const string num = ""; const string lab = ""; const string endheader = "
"; const string startmap = ""; const string endmap = ""; const string startvart = ""; const string endvart = ""; const string startvarn = ""; const string endvarn = ""; const string startsor = ""; const string endsor = ""; const string startform = ""; const string endform = ""; const string startvalLabel = ""; const string endvalLabel = ""; const string startvarlabel= ""; const string endvarlabel= ""; const string startcharacteristics = ""; const string endcharacteristics = ""; const string startch = ""; const string endch = ""; const string startdata = ""; const string enddata = ""; const string startstrl = ""; const string endstrl = ""; const string startvall = ""; const string endvall = ""; const string startlbl = ""; const string endlbl = ""; string end = "
"; end[end.size()] = '\0'; fstream dta (filePath, ios::out | ios::binary); if (dta.is_open()) { /* Stata 13 uses to store 14 byte positions in a dta-file. This * vector is now created and filled with the correct map positions. At * the end of the creation process, all 14 values are known and map will * be filled with the correct values. */ NumericVector map(14); map(0) = dta.tellg(); writestr(head, head.size(), dta); writestr(version, 3, dta); // 117|118 (e.g. Stata 13|14) writestr(byteord, byteord.size(), dta); writestr(sbyteorder, 3, dta); // LSF writestr(K, K.size(), dta); writebin(k, dta, swapit); writestr(num, num.size(), dta); if (release==117) writebin((int32_t)n, dta, swapit); if (release==118) writebin(n, dta, swapit); writestr(lab, lab.size(), dta); /* write a datalabel */ if(!datalabel.empty()) { if (datalabel.size() > maxdatalabelsize) { Rcpp::warning("Datalabel to long. Resizing. Max size is %d.", maxdatalabelsize); datalabel.resize(maxdatalabelsize); datalabel[datalabel.size()] = '\0'; } ndlabel = datalabel.size(); if (release==117) writebin((uint8_t)ndlabel, dta, swapit); if (release==118) writebin(ndlabel, dta, swapit); writestr(datalabel,datalabel.size(), dta); } else { // empty data label defined by byte(s) of zero uint8_t zero = 0; if (release == 117) { writebin(zero, dta, swapit); } if (release == 118) { writebin(zero, dta, swapit); writebin(zero, dta, swapit); } } /* timestamp size is 0 (= no timestamp) or 17 */ writestr(timest, timest.size(), dta); if (!timestamp.empty()) { ntimestamp = 17; writebin(ntimestamp, dta, swapit); writestr(timestamp, timestamp.size(), dta); }else{ writebin(ntimestamp, dta, swapit); } writestr(endheader, endheader.size(), dta); /* ... */ map(1) = dta.tellg(); writestr(startmap, startmap.size(), dta); for (int32_t i = 0; i <14; ++i) { uint64_t nmap = 0; writebin(nmap, dta, swapit); } writestr(endmap, endmap.size(), dta); /* ... */ map(2) = dta.tellg(); writestr(startvart, startvart.size(), dta); uint16_t nvartype; for (uint16_t i = 0; i < k; ++i) { nvartype = as(vartypes[i]); writebin(nvartype, dta, swapit); } writestr(endvart, endvart.size(), dta); /* ... */ map(3) = dta.tellg(); writestr(startvarn, startvarn.size(), dta); for (uint16_t i = 0; i < k; ++i ) { string nvarname = as(nvarnames[i]); nvarname[nvarname.size()] = '\0'; if (nvarname.size() > nvarnameslen) Rcpp::warning("Varname to long. Resizing. Max size is %d", nvarnameslen - 1); writestr(nvarname, nvarnameslen, dta); } writestr(endvarn, endvarn.size(), dta); /* ... */ map(4) = dta.tellg(); writestr(startsor, startsor.size(), dta); uint32_t big_k = k+1; for (uint32_t i = 0; i < big_k; ++i) { uint16_t nsortlist = 0; writebin(nsortlist, dta, swapit); } writestr(endsor, endsor.size(), dta); /* ... */ map(5) = dta.tellg(); writestr(startform, startform.size(), dta); for (uint16_t i = 0; i < k; ++i ) { string nformats = as(formats[i]); if (nformats.size() >= nformatslen) Rcpp::warning("Formats to long. Resizing. Max size is %d", nformatslen); writestr(nformats, nformatslen, dta); } writestr(endform, endform.size(), dta); /* ... */ map(6) = dta.tellg(); writestr(startvalLabel, startvalLabel.size(), dta); for (uint16_t i = 0; i < k; ++i) { string nvalLabels = as(valLabels[i]); nvalLabels[nvalLabels.size()] = '\0'; if (nvalLabels.size() > nvalLabelslen) Rcpp::warning("Vallabel to long. Resizing. Max size is %d", nvalLabelslen - 1); writestr(nvalLabels, nvalLabelslen, dta); } writestr(endvalLabel, endvalLabel.size(), dta); /* ... */ map(7) = dta.tellg(); writestr(startvarlabel, startvarlabel.size(), dta); for (uint16_t i = 0; i < k; ++i) { if (!Rf_isNull(varLabels) && Rf_length(varLabels) > 1) { string nvarLabels = as(varLabels[i]); if (nvarLabels.size() > nvarLabelslen) Rcpp::warning("Varlabel to long. Resizing. Max size is %d", nvarLabelslen - 1); nvarLabels[nvarLabels.size()] = '\0'; writestr(nvarLabels, nvarLabelslen, dta); } else { string nvarLabels = ""; nvarLabels[nvarLabels.size()] = '\0'; writestr(nvarLabels, nvarLabelslen, dta); } } writestr(endvarlabel, endvarlabel.size(), dta); /* ... */ map(8) = dta.tellg(); writestr(startcharacteristics, startcharacteristics.size(), dta); /* ... */ if (chs.size()>0){ for (int32_t i = 0; i(chs[i]); string ch1 = as(ch[0]); ch1[ch1.size()] = '\0'; string ch2 = as(ch[1]); ch2[ch2.size()] = '\0'; string ch3 = as(ch[2]); ch3[ch3.size()] = '\0'; uint32_t nnocharacter = chlen*2 + ch3.size() +1; writebin(nnocharacter, dta, swapit); writestr(ch1, chlen, dta); writestr(ch2, chlen, dta); writestr(ch3,ch3.size()+1, dta); writestr(endch, endch.size(), dta); } } writestr(endcharacteristics, endcharacteristics.size(), dta); /* ... */ map(9) = dta.tellg(); writestr(startdata, startdata.size(), dta); IntegerVector V, O; CharacterVector STRL; for(uint64_t j = 0; j < n; ++j) { for (uint16_t i = 0; i < k; ++i) { int const type = vartypes[i]; switch(type < 2046 ? 2045 : type) { // store numeric as Stata double (double) case 65526: { double val_d = 0; val_d = as(dat[i])[j]; if ( (val_d == NA_REAL) | R_IsNA(val_d) | R_IsNaN(val_d) | std::isinf(val_d) ) val_d = STATA_DOUBLE_NA; writebin(val_d, dta, swapit); break; } // float case 65527: { double val_d = 0; float val_f = 0; val_d = as(dat[i])[j]; if ( (val_d == NA_REAL) | (R_IsNA(val_d)) | R_IsNaN(val_d) | std::isinf(val_d) ) val_f = STATA_FLOAT_NA; else val_f = (double)(val_d); writebin(val_f, dta, swapit); break; } // store integer as Stata long (int32_t) case 65528: { int32_t val_l = 0; val_l = as(dat[i])[j]; if ( (val_l == NA_INTEGER) | (R_IsNA(val_l)) | R_IsNaN(val_l) | std::isinf(val_l) ) val_l = STATA_INT_NA; writebin(val_l, dta, swapit); break; } // int case 65529: { int16_t val_i = 0; int32_t val_l = 0; val_l = as(dat[i])[j]; if (val_l == NA_INTEGER) val_i = STATA_SHORTINT_NA; else val_i = val_l; writebin(val_i, dta, swapit); break; } // byte case 65530: { int8_t val_b = 0; int32_t val_l = 0; val_l = as(dat[i])[j]; if (val_l == NA_INTEGER) val_b = STATA_BYTE_NA; else val_b = val_l; writebin(val_b, dta, swapit); break; } // str case 2045: { int32_t const len = vartypes[i]; string val_s = as(as(dat[i])[j]); if(val_s == "NA") val_s.clear(); writestr(val_s, len, dta); break; } // strL case 32768: { /* Stata uses +1 */ int64_t z = 0; CharacterVector b = as(dat[i]); const string val_strl = as(b[j]); if (!val_strl.empty()) { switch (release) { case 117: { uint32_t v = i+1, o = j+1; writebin(v, dta, swapit); writebin(o, dta, swapit); // push back every v, o and val_strl V.push_back(v); O.push_back(o); break; } case 118: { int16_t v = i+1; int64_t o = j+1; char z[8]; // push back every v, o and val_strl V.push_back(v); O.push_back(o); // z is 'vv-- ----' memcpy(&z[0], &v, sizeof(v)); if (SBYTEORDER == 1) { o <<= 16; } memcpy(&z[2], &o, 6); // z is 'vvoo oooo' dta.write((char*)&z, sizeof(z)); // writestr((char*)&z, sizeof(z), dta); break; } } STRL.push_back(val_strl); } else { writestr((char*)&z, sizeof(z), dta); } break; } } } } writestr(enddata, enddata.size(), dta); /* ... */ map(10) = dta.tellg(); writestr(startstrl, startstrl.size(), dta); int32_t strlsize = STRL.length(); for(int i =0; i < strlsize; ++i ) { const string gso = "GSO"; int32_t v = V[i]; int64_t o = O[i]; uint8_t t = 129; //Stata binary type, no trailing zero. const string strL = as(STRL[i]); uint32_t len = strL.size(); writestr(gso, gso.size(), dta); writebin(v, dta, swapit); if (release==117) writebin((uint32_t)o, dta, swapit); if (release==118) writebin(o, dta, swapit); writebin(t, dta, swapit); writebin(len, dta, swapit); writestr(strL, strL.size(), dta); } writestr(endstrl, endstrl.size(), dta); /* ... */ map(11) = dta.tellg(); writestr(startvall, startvall.size(), dta); if (labeltable.size()>0) { CharacterVector labnames = labeltable.attr("names"); int8_t padding = 0; for (int32_t i=0; i < labnames.size(); ++i) { int32_t txtlen = 0; const string labname = as(labnames[i]); IntegerVector labvalue = labeltable[labname]; int32_t N = labvalue.size(); CharacterVector labelText = labvalue.attr("names"); IntegerVector off; /* * Fill off with offset position and create txtlen */ for (int32_t i = 0; i < labelText.size(); ++i) { string label = as(labelText[i]); uint32_t labellen = label.size()+1; if (labellen > maxlabelsize+1) labellen = maxlabelsize+1; txtlen += labellen; off.push_back ( txtlen-labellen ); } int32_t offI, labvalueI; int32_t nlen = sizeof(N) + sizeof(txtlen) + sizeof(offI)*N + sizeof(labvalueI)*N + txtlen; writestr(startlbl, startlbl.size(), dta); writebin(nlen, dta, swapit); writestr(labname, lbllen, dta); writestr((char*)&padding, 3, dta); writebin(N, dta, swapit); writebin(txtlen, dta, swapit); for (int32_t i = 0; i < N; ++i) { offI = off[i]; writebin(offI, dta, swapit); } for (int32_t i = 0; i < N; ++i) { labvalueI = labvalue[i]; writebin(labvalueI, dta, swapit); } for (int32_t i = 0; i < N; ++i) { string labtext = as(labelText[i]); if (labtext.size() > maxlabelsize) { Rcpp::warning("Label to long. Resizing. Max size is %d", maxlabelsize); labtext.resize(maxlabelsize); // labtext[labtext.size()] = '\0'; } writestr(labtext, labtext.size()+1, dta); } writestr(endlbl, endlbl.size(), dta); } } writestr(endvall, endvall.size(), dta); /* */ map(12) = dta.tellg(); writestr(end, end.size(), dta); /* end-of-file */ map(13) = dta.tellg(); /* seek up to to rewrite it*/ /* ... */ dta.seekg(map(1)); writestr(startmap, startmap.size(), dta); for (int i=0; i <14; ++i) { uint64_t nmap = 0; uint32_t hi = 0, lo = 0; nmap = map(i); hi = (nmap >> 32); lo = nmap; if (SBYTEORDER == 2) { // LSF writebin(lo, dta, swapit); writebin(hi, dta, swapit); } else { // MSF writebin(hi, dta, swapit); writebin(lo, dta, swapit); } } writestr(endmap, endmap.size(), dta); dta.close(); return 0; } else { throw std::range_error("Unable to open file."); return -1; } } readstata13/src/read_pre13_dta.cpp0000644000176200001440000003566413103052164016447 0ustar liggesusers/* * Copyright (C) 2014-2015 Jan Marvin Garbuszus and Sebastian Jeworutzki * * This program is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the * Free Software Foundation; either version 2 of the License, or (at your * option) any later version. * * This program is distributed in the hope that it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for * more details. * * You should have received a copy of the GNU General Public License along * with this program. If not, see . */ #include "readstata.h" using namespace Rcpp; using namespace std; List read_pre13_dta(FILE * file, const bool missing, const IntegerVector selectrows) { int8_t release = 0; rewind(file); release = readbin(release, file, 0); if (release<102 || release == 109 || release>115) Rcpp::stop("First byte: Not a dta-file we can read."); IntegerVector versionIV(1); versionIV(0) = release; /* * byteorder is a 4 byte character e.g. "LSF". MSF referes to big-memory data. */ uint16_t ndlabel = 81; uint8_t nvarnameslen = 33; int8_t nformatslen = 49; uint8_t nvalLabelslen = 33; uint16_t nvarLabelslen = 81; int32_t chlen = 33; uint8_t lbllen = 33; switch(release) { case 102: ndlabel = 30; nvarnameslen = 9; nformatslen = 7; nvalLabelslen = 9; nvarLabelslen = 32; break; case 103: case 104: ndlabel = 32; nvarnameslen = 9; nformatslen = 7; nvalLabelslen = 9; nvarLabelslen = 32; break; case 105: case 106: ndlabel = 32; nvarnameslen = 9; nformatslen = 12; nvalLabelslen = 9; nvarLabelslen = 32; lbllen = 9; break; case 107: case 108: nvarnameslen = 9; nformatslen = 12; nvalLabelslen = 9; lbllen = 9; break; case 110: case 111: case 112: case 113: nformatslen = 12; break; } CharacterVector byteorderC(1); IntegerVector byteorderI(1); bool swapit = 0; int8_t byteorder = 0; byteorder = readbin(byteorder, file, 0); // 1 = MSF 2 = LSF swapit = std::abs(SBYTEORDER-byteorder); byteorderI(0) = byteorder; // filetype: unnown? int8_t ft = 0; ft = readbin(ft, file, swapit); int8_t unused = 0; unused = readbin(unused, file, swapit); /* * Number of Variables */ uint16_t k = 0; k = readbin(k, file, swapit); /* * Number of Observations */ uint32_t n = 0; n = readbin(n, file, swapit); /* * A dataset may have a label e.g. "Written by R". * First we read its length (ndlabel), later the actual label (datalabel). * ndlabel: length of datalabel (excl. binary 0) * datalabel: string max length 80 */ CharacterVector datalabelCV(1); std::string datalabel(ndlabel, '\0'); if (ndlabel > 0) readstring(datalabel, file, datalabel.size()); else datalabel = ""; datalabelCV(0) = datalabel; CharacterVector timestampCV(1); std::string timestamp(18, '\0'); switch (release) { case 102: case 103: case 104: { timestamp = ""; break; } default: { readstring(timestamp, file, timestamp.size()); break; } } timestampCV(0) = timestamp; /* * vartypes. * 0-2045: strf (String: Max length 2045) * 32768: strL (long String: Max length 2 billion) * 65526: double * 65527: float * 65528: long * 65529: int * 65530: byte */ IntegerVector vartype(k); switch (release) { case 102: case 103: case 104: case 105: case 106: case 107: case 108: case 110: case 112: { uint8_t nvartypec = 0; for (uint16_t i=0; i127) vartype[i] = nvartypec - 127; } break; } case 111: case 113: case 114: case 115: { uint8_t nvartype = 0; for (uint16_t i=0; i ... */ List ch = List(); if (release > 104) { int8_t datatype = 0; uint32_t len = 0; datatype = readbin(datatype, file, swapit); if (release <= 108) len = readbin((uint16_t)len, file, swapit); else len = readbin(len, file, swapit); while (!(datatype==0) && !(len==0)) { std::string chvarname(chlen, '\0'); std::string chcharact(chlen, '\0'); std::string nnocharacter(len-chlen*2, '\0'); readstring(chvarname, file, chvarname.size()); readstring(chcharact, file, chcharact.size()); readstring(nnocharacter, file, nnocharacter.size()); // chs vector CharacterVector chs(3); chs[0] = chvarname; chs[1] = chcharact; chs[2] = nnocharacter; // add characteristics to the list ch.push_front( chs ); datatype = readbin(datatype, file, swapit); if (release <= 108) len = readbin((uint16_t)len, file, swapit); else len = readbin(len, file, swapit); } } /* * data. First a list is created with vectors. The vector type is defined by * vartype. Stata stores data columnwise so we loop over it and store the * data in the list of the first step. Third variable- and row-names are * attatched and the list type is changed to data.frame. */ /* replace vartypes of Stata 8 - 12 with Stata 13 values. */ // 117 contains new variable types (longer strings and strL) std::replace (vartype.begin(), vartype.end(), 251, STATA_BYTE); std::replace (vartype.begin(), vartype.end(), 252, STATA_SHORTINT); std::replace (vartype.begin(), vartype.end(), 253, STATA_INT); std::replace (vartype.begin(), vartype.end(), 254, STATA_FLOAT); std::replace (vartype.begin(), vartype.end(), 255, STATA_DOUBLE); uint32_t nmin = selectrows(0); uint32_t nmax = selectrows(1); uint32_t nn = 0; // if selectrows is c(0,0) use full data if ((nmin == 0) && (nmax == 0)){ nmin = 1; nmax = n; } // make sure that n is not greater nmax if (n < nmax) nmax = n; // neither should nmin be greater if (n < nmin) nmin = n; Rcpp::IntegerVector rvec = seq(nmin, nmax); nn = rvec.size(); // use c indexing starting at 0 nmin = nmin -1; nmax = nmax -1; // 1. create the list List df(k); for (uint16_t i=0; i nmax)) { import = 0; } else { import = 1; // temoprary index values to be reset at the end of the loop tmp_val = j; j = tmp_j; tmp_j++; } for (uint16_t i=0; iSTATA_DOUBLE_NA_MAX)) ) REAL(VECTOR_ELT(df,i))[j] = NA_REAL; else REAL(VECTOR_ELT(df,i))[j] = val_d; } break; } // float case STATA_FLOAT: { float val_f = 0; val_f = readbin(val_f, file, swapit); if (import == 1) { if ((missing == FALSE) & ((val_fSTATA_FLOAT_NA_MAX)) ) REAL(VECTOR_ELT(df,i))[j] = NA_REAL; else REAL(VECTOR_ELT(df,i))[j] = val_f; } break; } //long case STATA_INT: { int32_t val_l = 0; val_l = readbin(val_l, file, swapit); if (import == 1) { if ((missing == FALSE) & ((val_lSTATA_INT_NA_MAX)) ) INTEGER(VECTOR_ELT(df,i))[j] = NA_INTEGER; else INTEGER(VECTOR_ELT(df,i))[j] = val_l; } break; } // int case STATA_SHORTINT: { int16_t val_i = 0; val_i = readbin(val_i, file, swapit); if (import == 1) { if ((missing == FALSE) & ((val_iSTATA_SHORTINT_NA_MAX)) ) INTEGER(VECTOR_ELT(df,i))[j] = NA_INTEGER; else INTEGER(VECTOR_ELT(df,i))[j] = val_i; } break; } // byte case STATA_BYTE: { int8_t val_b = 0; val_b = readbin(val_b, file, swapit); if (import == 1) { if ((missing == FALSE) & ( (val_bSTATA_BYTE_NA_MAX)) ) INTEGER(VECTOR_ELT(df,i))[j] = NA_INTEGER; else INTEGER(VECTOR_ELT(df,i))[j] = val_b; } break; } // strings with 244 or fewer characters default: { int32_t len = 0; len = vartype[i]; std::string val_s (len, '\0'); readstring(val_s, file, val_s.size()); if (import == 1) { as(df[i])[j] = val_s; } break; } } Rcpp::checkUserInterrupt(); } // reset temporary index values to their original values if (import == 1) j = tmp_val; } // 3. Create a data.frame df.attr("row.names") = rvec; df.attr("names") = varnames; df.attr("class") = "data.frame"; /* * labels are seperated by -tags. Labels may appear in any order e.g. * 2 "female" 1 "male 9 "missing". They are stored as tables. * nlen: length of label. * nlabname: label name. * labn: number of labels in this set (e.g. "male" "female" = 2) * txtlen: length of the label text. * off: offset defines where to read a new label in txtlen. */ List labelList = List(); //put labels into this list if (release>105) { // FixMe: the while statement differs and the final check int32_t nlen = 0, labn = 0, txtlen = 0, noff = 0, val = 0; std::string tag(5, '\0'); bool haslabel = false; // length of value_label_table nlen = readbin(nlen, file, swapit); if (!(feof(file) || ferror(file))) haslabel = true; while(haslabel) { // name of this label set std::string nlabname(lbllen, '\0'); readstring(nlabname, file, nlabname.size()); //padding fseek(file, 3, SEEK_CUR); // value_label_table for actual label set labn = readbin(labn, file, swapit); txtlen = readbin(txtlen, file, swapit); // offset for each label // off0 : label 0 starts at off0 // off1 : label 1 starts at off1 ... IntegerVector off(labn); for (int i=0; i < labn; ++i) { noff = readbin(noff, file, swapit); off[i] = noff; } // needed for match IntegerVector laborder = clone(off); //laborder.erase(labn+1); IntegerVector labordersort = clone(off); //labordersort.erase(labn+1); std::sort(labordersort.begin(), labordersort.end()); // needs txtlen for loop off.push_back(txtlen); // sort offsets so we can read labels sequentially std::sort(off.begin(), off.end()); // create an index to sort lables along the code values // this is done while factor creation IntegerVector indx(labn); indx = match(laborder,labordersort); // code for each label IntegerVector code(labn); for (int i=0; i < labn; ++i) { val = readbin(val, file, swapit); code[i] = val; } // label text CharacterVector label(labn); for (int i=0; i < labn; ++i) { int lablen = off[i+1]-off[i]; std::string lab (lablen, '\0'); readstring(lab, file, lablen); label[i] = lab; } // sort labels according to indx CharacterVector labelo(labn); for (int i=0; i < labn; ++i) { labelo[i] = label[indx[i]-1]; } // create table for actual label set string const labset = nlabname; code.attr("names") = labelo; // add this set to output list labelList.push_front( code, labset); // length of value_label_table nlen = readbin(nlen, file, swapit); if (feof(file) || ferror(file)) break; } } /* * assign attributes to the resulting data.frame */ df.attr("datalabel") = datalabelCV; df.attr("time.stamp") = timestampCV; df.attr("formats") = formats; df.attr("types") = types; df.attr("val.labels") = valLabels; df.attr("var.labels") = varLabels; df.attr("version") = versionIV; df.attr("label.table") = labelList; df.attr("expansion.fields") = ch; df.attr("byteorder") = byteorderI; return df; } readstata13/src/rcpp_pre13_savestata.cpp0000644000176200001440000002742013103052164017712 0ustar liggesusers/* * Copyright (C) 2015 Jan Marvin Garbuszus and Sebastian Jeworutzki * * This program is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the * Free Software Foundation; either version 2 of the License, or (at your * option) any later version. * * This program is distributed in the hope that it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for * more details. * * You should have received a copy of the GNU General Public License along * with this program. If not, see . */ #include "readstata.h" using namespace Rcpp; using namespace std; // Writes the binary Stata file // // @param filePath The full systempath to the dta file you want to export. // @param dat an R-Object of class data.frame. // @export // [[Rcpp::export]] int stata_pre13_save(const char * filePath, Rcpp::DataFrame dat) { uint16_t k = dat.size(); uint32_t n = dat.nrows(); int8_t byteorder = SBYTEORDER; string timestamp = dat.attr("timestamp"); timestamp.resize(18); string datalabel = dat.attr("datalabel"); datalabel[datalabel.size()] = '\0'; CharacterVector valLabels = dat.attr("vallabels"); CharacterVector nvarnames = dat.attr("names"); List chs = dat.attr("expansion.fields"); List formats = dat.attr("formats"); List labeltable = dat.attr("label.table"); List varLabels = dat.attr("var.labels"); List vartypes = dat.attr("types"); int8_t version = as(dat.attr("version")); fstream dta (filePath, ios::out | ios::binary); if (dta.is_open()) { uint32_t ndlabel = 81; uint32_t nformatslen = 49; uint32_t nvarnameslen = 33; uint32_t nvalLabelslen = 33; uint32_t nvarLabelslen = 81; uint32_t chlen = 33; uint32_t maxlabelsize = 32000; uint32_t maxstrsize = 244; if (version<111 || version==112) maxstrsize = 80; switch(version) { case 102: ndlabel = 30; nvarnameslen = 9; nformatslen = 7; nvalLabelslen = 9; nvarLabelslen = 32; break; case 103: case 104: ndlabel = 32; nvarnameslen = 9; nformatslen = 7; nvalLabelslen = 9; nvarLabelslen = 32; break; case 105: case 106:// unknown version (SE?) ndlabel = 32; nvarnameslen = 9; nformatslen = 12; nvalLabelslen = 9; nvarLabelslen = 32; break; case 107: // unknown version (SE?) case 108: nvarnameslen = 9; nformatslen = 12; nvalLabelslen = 9; case 110: case 111: case 112: case 113: nformatslen = 12; break; } writebin(version, dta, swapit); // format writebin(byteorder, dta, swapit); // LSF int8_t ft = 1; // filetype writebin(ft, dta, swapit); int8_t unused = 0; // unused writebin(unused, dta, swapit); writebin(k, dta, swapit); // nvars writebin(n, dta, swapit); // nobs /* write a datalabel */ if (datalabel.size() > ndlabel) Rcpp::warning("Datalabel too long. Resizing. Max size is %d.", ndlabel - 1); writestr(datalabel, ndlabel, dta); /* timestamp size is 17 */ if (version > 104) { if (timestamp.size() > 18) { Rcpp::warning("Timestamp too long. Dropping."); timestamp = ""; } writestr(timestamp, timestamp.size(), dta); } /* ... */ uint8_t nvartype; for (uint16_t i = 0; i < k; ++i) { nvartype = as(vartypes[i]); if(version<111 || version==112) { char c[2]; switch(nvartype) { case 255: strcpy(c, "d"); c[1] = '\0'; dta.write(c, 1); break; case 254: strcpy(c, "f"); c[1] = '\0'; dta.write(c, 1); break; case 253: strcpy(c, "l"); c[1] = '\0'; dta.write(c, 1); break; case 252: strcpy(c, "i"); c[1] = '\0'; dta.write(c, 1); break; case 251: strcpy(c,"b"); c[1] = '\0'; dta.write(c, 1); break; default: char d = char(nvartype+127); dta.write(&d, 1); break; } } else writebin(nvartype, dta, swapit); } /* ... */ for (uint16_t i = 0; i < k; ++i ) { string nvarname = as(nvarnames[i]); if (nvarname.size() > nvarnameslen) Rcpp::warning("Varname too long. Resizing. Max size is %d", nvarnameslen - 1); writestr(nvarname, nvarnameslen, dta); } /* ... */ uint32_t big_k = k+1; for (uint32_t i = 0; i < big_k; ++i) { uint16_t nsortlist = 0; writebin(nsortlist, dta, swapit); } /* ... */ for (uint16_t i = 0; i < k; ++i ) { string nformats = as(formats[i]); if (nformats.size() > nformatslen) Rcpp::warning("Formats too long. Resizing. Max size is %d", nformatslen - 1); writestr(nformats, nformatslen, dta); } /* ... */ for (uint16_t i = 0; i < k; ++i ) { string nvalLabels = as(valLabels[i]); if (nvalLabels.size() > nvalLabelslen) Rcpp::warning("Vallabel too long. Resizing. Max size is %d", nvalLabelslen - 1); writestr(nvalLabels, nvalLabelslen, dta); } /* ... */ for (uint16_t i = 0; i < k; ++i) { string nvarLabels = ""; if (!Rf_isNull(varLabels) && Rf_length(varLabels) > 1) { nvarLabels = as(varLabels[i]); if (nvarLabels.size() > nvarLabelslen) Rcpp::warning("Varlabel too long. Resizing. Max size is %d", nvarLabelslen - 1); } writestr(nvarLabels, nvarLabelslen, dta); } /* ... */ if (version > 104) { int8_t datatype = 0; uint32_t len = 0; if (chs.size()>0){ for (int32_t i = 0; i(chs[i]); string ch1 = as(ch[0]); ch1[ch1.size()] = '\0'; string ch2 = as(ch[1]); ch2[ch2.size()] = '\0'; string ch3 = as(ch[2]); ch3[ch3.size()] = '\0'; len = chlen + chlen + ch3.size()+1; datatype = 1; writebin(datatype, dta, swapit); if(version<=108) writebin((int16_t)len, dta, swapit); else writebin(len, dta, swapit); writestr(ch1, chlen, dta); writestr(ch2, chlen, dta); writestr(ch3, ch3.size()+1, dta); } } // five bytes of zero end characteristics datatype = 0; len = 0; writebin(datatype, dta, swapit); if (version<=108) writebin((int16_t)len, dta, swapit); else writebin(len, dta, swapit); } /* ... */ for(uint32_t j = 0; j < n; ++j) { for (uint16_t i = 0; i < k; ++i) { int const type = vartypes[i]; switch(type) { // store numeric as Stata double (double) case 255: { double val_d = 0; val_d = as(dat[i])[j]; if ( (val_d == NA_REAL) | R_IsNA(val_d) ) val_d = STATA_DOUBLE_NA; writebin(val_d, dta, swapit); break; } // float case 254: { double val_d = 0; float val_f = 0; val_d = as(dat[i])[j]; if ((val_d == NA_REAL) | (R_IsNA(val_d)) ) val_f = STATA_FLOAT_NA; else val_f = (float)(val_d); writebin(val_f, dta, swapit); break; } // store integer as Stata long (int32_t) case 253: { int32_t val_l = 0; val_l = as(dat[i])[j]; if ( (val_l == NA_INTEGER) | (R_IsNA(val_l)) ) { if(version>111) val_l = STATA_INT_NA; else val_l = STATA_INT_NA_108; } writebin(val_l, dta, swapit); break; } // int case 252: { int16_t val_i = 0; int32_t val_l = 0; val_l = as(dat[i])[j]; if (val_l == NA_INTEGER) val_i = STATA_SHORTINT_NA; else val_i = val_l; writebin(val_i, dta, swapit); break; } // byte case 251: { int8_t val_b = 0; int32_t val_l = 0; val_l = as(dat[i])[j]; if (val_l == NA_INTEGER) { if (version>104) val_b = STATA_BYTE_NA; else val_b = STATA_BYTE_NA_104; } else { val_b = val_l; } writebin(val_b, dta, swapit); break; } default: { int32_t len = vartypes[i]; string val_s = as(as(dat[i])[j]); if(val_s == "NA") val_s.clear(); // Stata 6-12 can only store 244 byte strings if(val_s.size()>maxstrsize) { Rcpp::warning("Character value too long. Resizing. Max size is %d.", maxstrsize); } writestr(val_s, len, dta); break; } } } } /* ... */ if ((labeltable.size()>0) & (version>105)) { CharacterVector labnames = labeltable.attr("names"); int8_t padding = 0; for (int32_t i=0; i < labnames.size(); ++i) { int32_t txtlen = 0; string labname = as(labnames[i]); IntegerVector labvalue = labeltable[labname]; int32_t N = labvalue.size(); CharacterVector labelText = labvalue.attr("names"); IntegerVector off; /* * Fill off with offset position and create txtlen */ for (int32_t i = 0; i < labelText.size(); ++i) { string label = as(labelText[i]); uint32_t labellen = label.size()+1; if (labellen > maxlabelsize+1) labellen = maxlabelsize+1; txtlen += labellen; off.push_back ( txtlen-labellen ); } int32_t offI, labvalueI; int32_t nlen = sizeof(N) + sizeof(txtlen) + sizeof(offI)*N + sizeof(labvalueI)*N + txtlen; writebin(nlen, dta, swapit); writestr(labname, nvarnameslen, dta); dta.write((char*)&padding,3); writebin(N, dta, swapit); writebin(txtlen, dta, swapit); for (int32_t i = 0; i < N; ++i) { offI = off[i]; writebin(offI, dta, swapit); } for (int32_t i = 0; i < N; ++i) { labvalueI = labvalue[i]; writebin(labvalueI, dta, swapit); } for (int32_t i = 0; i < N; ++i) { string labtext = as(labelText[i]); if (labtext.size() > maxlabelsize) { Rcpp::warning("Label too long. Resizing. Max size is %d", maxlabelsize); labtext.resize(maxlabelsize); // labtext[labtext.size()] = '\0'; } writestr(labtext, labtext.size()+1, dta); } } } dta.close(); return 0; } else { Rcpp::stop("Unable to open file."); return -1; } } readstata13/src/read_dta.cpp0000644000176200001440000004562013103052164015426 0ustar liggesusers/* * Copyright (C) 2014-2015 Jan Marvin Garbuszus and Sebastian Jeworutzki * * This program is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the * Free Software Foundation; either version 2 of the License, or (at your * option) any later version. * * This program is distributed in the hope that it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for * more details. * * You should have received a copy of the GNU General Public License along * with this program. If not, see . */ #include "readstata.h" using namespace Rcpp; using namespace std; List read_dta(FILE * file, const bool missing, const IntegerVector selectrows) { // stata_dta>
test("stata_dta>
", file); test("", file); /* * version is a 4 byte character e.g. "117" */ int8_t fversion = 117L; //f = first int8_t lversion = 118L; //l = last std::string version(3, '\0'); readstring(version, file, version.size()); int8_t const release = atoi(version.c_str()); IntegerVector versionIV(1); versionIV(0) = release; // check the release version. if (releaselversion) { Rcpp::warning("File version is %d.\nVersion: Not a version 13/14 dta-file", release); return -1; } uint8_t nvarnameslen = 0; int8_t nformatslen = 0; uint8_t nvalLabelslen = 0; uint16_t nvarLabelslen = 0; int32_t chlen = 0; uint8_t lbllen = 0; switch(release) { case 117: nvarnameslen = 33; nformatslen = 49; nvalLabelslen = 33; nvarLabelslen = 81; chlen = 33; lbllen = 33; break; case 118: nvarnameslen = 129; nformatslen = 57; nvalLabelslen = 129; nvarLabelslen = 321; chlen = 129; lbllen = 129; break; } // test("", file); test("", file); /* * byteorder is a 4 byte character e.g. "LSF". MSF referes to big-memory data. */ std::string byteorder(3, '\0'); readstring(byteorder,file, byteorder.size()); // test("", file); test("", file); bool swapit = 0; swapit = strcmp(byteorder.c_str(), sbyteorder); /* * Number of Variables */ uint16_t k = 0; k = readbin(k, file, swapit); // test("", file); test("", file); /* * Number of Observations */ uint64_t n = 0; if(release==117) { n = readbin((uint32_t)n, file, swapit); } if (release ==118) { n = readbin(n, file, swapit); } // test("", file); test(" test("", file); test("", file); /* * A dataset may have a timestamp. If it has a timestamp the length of the * timestamp (ntimestamp) is 17. Else it is zero. * ntimestamp: 0 or 17 * timestamp: empty or 17 byte string */ uint8_t ntimestamp = 0; ntimestamp = readbin(ntimestamp, file, swapit); std::string timestamp(17, '\0'); if (ntimestamp == 17) // ntimestap is 0 or 17 { readstring(timestamp, file, timestamp.size()); } else { timestamp = ""; } CharacterVector timestampCV = timestamp; //
test("
", file); test("", file); /* * Stata stores the byteposition of certain areas of the file here. Currently * this is of no use to us. * 1. * 2. * 3. * 4. * 5. * 6. * 7. * 8. * 9. * 10. * 11. * 12. * 13. * 14. end-of-file */ NumericVector map(14); for (int i=0; i <14; ++i) { uint64_t nmap = 0; nmap = readbin(nmap, file, swapit); map[i] = nmap; } // test("
", file); test("", file); /* * vartypes. * 0-2045: strf (String: Max length 2045) * 32768: strL (long String: Max length 2 billion) * 65526: double * 65527: float * 65528: long * 65529: int * 65530: byte */ IntegerVector vartype(k); for (uint16_t i=0; i test("", file); test("", file); /* * varnames. */ std::string nvarnames(nvarnameslen, '\0'); CharacterVector varnames(k); for (uint16_t i=0; i test("", file); test("", file); /* * sortlist. Stata stores the information which variable of a dataset was * sorted. Depending on byteorder sortlist is written different. Currently we * do not use this information. * Vector size is k+1. */ uint32_t big_k = k+1; IntegerVector sortlist(big_k); for (uint32_t i=0; i test("", file); test("", file); /* * formats handle how Stata prints a variable. Currently we do not use this * information. */ std::string nformats(nformatslen, '\0'); CharacterVector formats(k); for (uint16_t i=0; i test("", file); test("",file); /* * value_label_names. Stata stores variable labels by names. * nvalLabels: length of the value_label_name * valLabels: */ std::string nvalLabels(nvalLabelslen, '\0'); CharacterVector valLabels(k); for (uint16_t i=0; i test("", file); test("", file); /* * variabel_labels */ std::string nvarLabels (nvarLabelslen, '\0'); CharacterVector varLabels(k); for (uint16_t i=0; i test("", file); test("", file); /* * characteristics. Stata can store additional information this way. It may * contain notes (for the dataset or a variable) or about label language sets. * Characteristics are not documented. We export them as attribute: * expansion.fields. Characteristics are seperated by tags. Each has: * nocharacter: length of the characteristics * chvarname: varname (binary 0 terminated) * chcharact: characteristicsname (binary 0 terminated) * nnocharacter: contes (binary 0 terminated) */ std::string chtag = ""; std::string tago(4, '\0'); readstring(tago, file, tago.size()); List ch = List(); CharacterVector chs(3); while (chtag.compare(tago)==0) { uint32_t nocharacter = 0; nocharacter = readbin(nocharacter, file, swapit); std::string chvarname(chlen, '\0'); std::string chcharact(chlen, '\0'); std::string nnocharacter(nocharacter-chlen*2, '\0'); readstring(chvarname, file, chvarname.size()); readstring(chcharact, file, chcharact.size()); readstring(nnocharacter, file, nnocharacter.size()); // chs vector CharacterVector chs(3); chs[0] = chvarname; chs[1] = chcharact; chs[2] = nnocharacter; // add characteristics to the list ch.push_front( chs ); // test("", file); // read next tag readstring(tago, file, tago.size()); } //[ test("aracteristics>", file); test("", file); /* * data. First a list is created with vectors. The vector type is defined by * vartype. Stata stores data columnwise so we loop over it and store the * data in the list of the first step. Third variable- and row-names are * attatched and the list type is changed to data.frame. */ uint64_t nmin = selectrows(0); uint64_t nmax = selectrows(1); uint64_t nn = 0; // if selectrows is c(0,0) use full data if ((nmin == 0) && (nmax == 0)){ nmin = 1; nmax = n; } // make sure that n is not greater nmax if (n < nmax) nmax = n; // neither should nmin be greater if (n < nmin) nmin = n; Rcpp::IntegerVector rvec = seq(nmin, nmax); nn = rvec.size(); // use c indexing starting at 0 nmin = nmin -1; nmax = nmax -1; // 1. create the list List df(k); for (uint16_t i=0; i nmax)) { import = 0; } else { import = 1; // temoprary index values to be reset at the end of the loop tmp_val = j; j = tmp_j; tmp_j++; } for (uint16_t i=0; iSTATA_DOUBLE_NA_MAX)) ) REAL(VECTOR_ELT(df,i))[j] = NA_REAL; else REAL(VECTOR_ELT(df,i))[j] = val_d; } break; } // float case 65527: { float val_f = 0; val_f = readbin(val_f, file, swapit); if (import == 1) { if ((missing == 0) && ((val_fSTATA_FLOAT_NA_MAX)) ) REAL(VECTOR_ELT(df,i))[j] = NA_REAL; else REAL(VECTOR_ELT(df,i))[j] = val_f; } break; } //long case 65528: { int32_t val_l = 0; val_l = readbin(val_l, file, swapit); if (import == 1) { if ((missing == 0) && ((val_lSTATA_INT_NA_MAX)) ) INTEGER(VECTOR_ELT(df,i))[j] = NA_INTEGER; else INTEGER(VECTOR_ELT(df,i))[j] = val_l; } break; } // int case 65529: { int16_t val_i = 0; val_i = readbin(val_i, file, swapit); if (import == 1) { if ((missing == 0) && ((val_iSTATA_SHORTINT_NA_MAX)) ) INTEGER(VECTOR_ELT(df,i))[j] = NA_INTEGER; else INTEGER(VECTOR_ELT(df,i))[j] = val_i; } break; } // byte case 65530: { int8_t val_b = 0; val_b = readbin(val_b, file, swapit); if (import == 1) { if (missing == 0 && ( (val_bSTATA_BYTE_NA_MAX)) ) INTEGER(VECTOR_ELT(df,i))[j] = NA_INTEGER; else INTEGER(VECTOR_ELT(df,i))[j] = val_b; } break; } // strings with 2045 or fewer characters case 2045: { int32_t len = 0; len = vartype[i]; std::string val_s (len, '\0'); readstring(val_s, file, val_s.size()); if (import == 1) { as(df[i])[j] = val_s; } break; } // string of any length case 32768: {// strL 2*4bit or 2 + 6 bit //char val_strl[22]; // FixMe: Strl in 118 switch (release) { case 117: { uint32_t v = 0, o = 0; v = readbin(v, file, swapit); o = readbin(o, file, swapit); stringstream val_stream; val_stream << v << '_' << o; string val_strl = val_stream.str(); //sprintf(val_strl, "%010d%010d", v, o); if (import == 1) { as(df[i])[j] = val_strl; } break; } case 118: { int16_t v = 0; int64_t o = 0, z = 0; z = readbin(z, file, swapit); // works for LSF on little- and big-endian if(byteorder.compare("LSF")==0) { v = (int16_t)z; o = (z >> 16); } // works if we read a big-endian file on little-endian if(byteorder.compare("MSF")==0) { v = (z >> 48) & ((1 << 16) - 1); o = z & ((1 << 16) - 1); } stringstream val_stream; val_stream << v << '_' << o; string val_strl = val_stream.str(); if (import == 1) { as(df[i])[j] = val_strl; } break; } } } } Rcpp::checkUserInterrupt(); } // reset temporary index values to their original values if (import == 1) j = tmp_val; } // 3. Create a data.frame df.attr("row.names") = rvec; df.attr("names") = varnames; df.attr("class") = "data.frame"; // test("", file); test("", file); /* * strL. Stata 13 introduced long strings up to 2 billon characters. strLs are * sperated by "GSO". * (v,o): Position in the data.frame. * t: 129/130 defines whether or not the strL is stored with a binary 0. * len: length of the strL. * strl: long string. */ std::string gso = "GSO"; std::string tags(3, '\0'); readstring(tags, file, tags.size()); //put strLs into a named vector CharacterVector strlvalues(0); CharacterVector strlnames(0); while(gso.compare(tags)==0) { CharacterVector strls(2); string ref; // FixMe: Strl in 118 switch (release) { case 117: { uint32_t v = 0, o = 0; v = readbin(v, file, swapit); o = readbin(o, file, swapit); stringstream val_stream; val_stream << v << '_' << o; ref.assign(val_stream.str()); //sprintf(ref, "%010d%010d", v, o); break; } case 118: { uint32_t v = 0; uint64_t o = 0; // uint64_t z = 0; v = readbin(v, file, swapit); o = readbin(o, file, swapit); // z = readbin(z, file, swapit); stringstream val_stream; val_stream << v << '_' << o; ref.assign(val_stream.str()); //sprintf(ref, "%010d%010ld", v, o); break; } } // (129 = binary) | (130 = ascii) uint8_t t = 0; t = readbin(t, file, swapit); uint32_t len = 0; len = readbin(len, file, swapit); // 129 len = len; 130 len = len +'\0'; std::string strl(len, '\0'); readstring(strl, file, strl.size()); strlvalues.push_back( strl ); strlnames.push_back( ref ); readstring(tags, file, tags.size()); } // set identifier as name strlvalues.attr("names") = strlnames; // after strls //[ test("trls>", file); test("", file); /* * labels are seperated by -tags. Labels may appear in any order e.g. * 2 "female" 1 "male 9 "missing". They are stored as tables. * nlen: length of label. * nlabname: label name. * labn: number of labels in this set (e.g. "male" "female" = 2) * txtlen: length of the label text. * off: offset defines where to read a new label in txtlen. */ std::string lbltag = ""; std::string tag(5, '\0'); readstring(tag, file, tag.size()); List labelList = List(); //put labels into this list while(lbltag.compare(tag)==0) { int32_t nlen = 0, labn = 0, txtlen = 0, noff = 0, val = 0; // length of value_label_table nlen = readbin(nlen, file, swapit); // name of this label set std::string nlabname(lbllen, '\0'); readstring(nlabname, file, nlabname.size()); //padding fseek(file, 3, SEEK_CUR); // value_label_table for actual label set labn = readbin(labn, file, swapit); txtlen = readbin(txtlen, file, swapit); // offset for each label // off0 : label 0 starts at off0 // off1 : label 1 starts at off1 ... IntegerVector off(labn); for (int i=0; i < labn; ++i) { noff = readbin(noff, file, swapit); off[i] = noff; } // needed for match IntegerVector laborder = clone(off); //laborder.erase(labn+1); IntegerVector labordersort = clone(off); //labordersort.erase(labn+1); std::sort(labordersort.begin(), labordersort.end()); // needs txtlen for loop off.push_back(txtlen); // sort offsets so we can read labels sequentially std::sort(off.begin(), off.end()); // create an index to sort lables along the code values // this is done while factor creation IntegerVector indx(labn); indx = match(laborder,labordersort); // code for each label IntegerVector code(labn); for (int i=0; i < labn; ++i) { val = readbin(val, file, swapit); code[i] = val; } // label text CharacterVector label(labn); for (int i=0; i < labn; ++i) { int lablen = off[i+1]-off[i]; std::string lab (lablen, '\0'); readstring(lab, file, lablen); label[i] = lab; } // sort labels according to indx CharacterVector labelo(labn); for (int i=0; i < labn; ++i) { labelo[i] = label[indx[i]-1]; } // create table for actual label set string const labset = nlabname; code.attr("names") = labelo; // add this set to output list labelList.push_front( code, labset); fseek(file, 6, SEEK_CUR); // readstring(tag, file, tag.size()); } /* * Final test if we reached the end of the file * close the file */ // [ test("ue_labels>", file); test("", file); /* * assign attributes to the resulting data.frame */ df.attr("datalabel") = datalabelCV; df.attr("time.stamp") = timestampCV; df.attr("formats") = formats; df.attr("types") = vartype; df.attr("val.labels") = valLabels; df.attr("var.labels") = varLabels; df.attr("version") = versionIV; df.attr("label.table") = labelList; df.attr("expansion.fields") = ch; df.attr("strl") = strlvalues; df.attr("byteorder") = wrap(byteorder); return df; } readstata13/src/Makevars.win0000644000176200001440000000016113103052164015436 0ustar liggesusers## -*- mode: makefile; -*- PKG_CPPFLAGS = -I../inst/include -I. PKG_LIBS = $(LAPACK_LIBS) $(BLAS_LIBS) $(FLIBS) readstata13/src/RcppExports.cpp0000644000176200001440000000343113103052164016146 0ustar liggesusers// Generated by using Rcpp::compileAttributes() -> do not edit by hand // Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 #include using namespace Rcpp; // stata_pre13_save int stata_pre13_save(const char * filePath, Rcpp::DataFrame dat); RcppExport SEXP readstata13_stata_pre13_save(SEXP filePathSEXP, SEXP datSEXP) { BEGIN_RCPP Rcpp::RObject rcpp_result_gen; Rcpp::RNGScope rcpp_rngScope_gen; Rcpp::traits::input_parameter< const char * >::type filePath(filePathSEXP); Rcpp::traits::input_parameter< Rcpp::DataFrame >::type dat(datSEXP); rcpp_result_gen = Rcpp::wrap(stata_pre13_save(filePath, dat)); return rcpp_result_gen; END_RCPP } // stata_read List stata_read(const char * filePath, const bool missing, const IntegerVector selectrows); RcppExport SEXP readstata13_stata_read(SEXP filePathSEXP, SEXP missingSEXP, SEXP selectrowsSEXP) { BEGIN_RCPP Rcpp::RObject rcpp_result_gen; Rcpp::RNGScope rcpp_rngScope_gen; Rcpp::traits::input_parameter< const char * >::type filePath(filePathSEXP); Rcpp::traits::input_parameter< const bool >::type missing(missingSEXP); Rcpp::traits::input_parameter< const IntegerVector >::type selectrows(selectrowsSEXP); rcpp_result_gen = Rcpp::wrap(stata_read(filePath, missing, selectrows)); return rcpp_result_gen; END_RCPP } // stata_save int stata_save(const char * filePath, Rcpp::DataFrame dat); RcppExport SEXP readstata13_stata_save(SEXP filePathSEXP, SEXP datSEXP) { BEGIN_RCPP Rcpp::RObject rcpp_result_gen; Rcpp::RNGScope rcpp_rngScope_gen; Rcpp::traits::input_parameter< const char * >::type filePath(filePathSEXP); Rcpp::traits::input_parameter< Rcpp::DataFrame >::type dat(datSEXP); rcpp_result_gen = Rcpp::wrap(stata_save(filePath, dat)); return rcpp_result_gen; END_RCPP } readstata13/NAMESPACE0000644000176200001440000000104513077075424013616 0ustar liggesusers# Generated by roxygen2: do not edit by hand export("varlabel<-") export(as.caldays) export(get.label) export(get.label.name) export(get.lang) export(get.origin.codes) export(read.dta13) export(save.dta13) export(set.label) export(set.lang) export(stbcal) export(varlabel) import(Rcpp) importFrom(stats,complete.cases) importFrom(stats,na.omit) importFrom(utils,download.file) importFrom(utils,localeToCharset) importFrom(utils,setTxtProgressBar) importFrom(utils,txtProgressBar) useDynLib(readstata13) useDynLib(readstata13, .registration = TRUE) readstata13/NEWS0000644000176200001440000000540413103037376013073 0ustar liggesusers[0.9.0] - generate unique factor labels to prevent errors in factor definition - check interrupt for long read - fix storage size of character vectors in save.dta13 - fix saving characters containing missings - implement partial reading of dta-files - fix an integer bug with saving data.frames of length requiring uint64_t 0.8.5 - fix errors on big-endian systems 0.8.4 - fix valgrind errors. converting from dta.write to writestr - fix for empty data label - make replace.strl default 0.8.3 - restrict length of varnames to 32 chars for compatibility with Stata 14 - Stop compression of doubles as floats. Now test if compression of doubles as interger types is possible. - add many function tests 0.8.2 - save NA values in character vector as empty string - convert.underscore=T will convert all non-literal characters to underscores - fix saving of Dates - save with convert.factors by default - test for NaN and inf values while writing missing values and replace with NA - remove message about saving factors 0.8.1 - convert non-integer variables to factors (nonint.factors=T) - working with strL variables is now a lot faster (thank to Magnus Thor Torfason) - fix handling of large datasets - some code cleanups 0.8 - implement reading all version prior 13. - clean up code. - fix a crash when varlables do not match ncols. - update leap seconds R code with foreign. 0.7.1 - fix saving of files > 2GB 0.7 - read and write Stata 14 files (ver 118) - fix save for variables without non-missing values - read strings from different file encodings - code cleanups 0.6.1 - fix heap overflow 0.6 - various fixes - reading stbcal-files 0.5 - write dta-files - read/write LSF and MSF files - source testing and cleaning - support for multiple label languages (see http://www.stata.com/manuals13/dlabellanguage.pdf) - additional tools for label handling 0.4 - convert.dates from foreign::read.dta() - handle different NA values - convert strings to system encoding - some checks on label assignment 0.3 - reading file from url. Example: `read.dta13("http://www.stata-press.com/data/r13/auto.dta")` - convert.underscore from foreign::read.dta(): converts _ to . - missing.type parts from foreign::read.dta(). If TRUE return "missing" - replace.strl option to replace the reference to a STRL string in the data.frame with the actual value 0.2 - read stata characteristics and save them in extension.table attribute - more robust handling of factor labels - set file encoding for all strings and convert them to system encoding - fixed compiler warnings 0.1 - reading data files and create a data.frame - assign variable names - read the new strL strings and save them as attribute - convert stata label to factors and save them as attribute - read some meta data (timestamp, dataset label, formats,...) readstata13/R/0000755000176200001440000000000013103030636012562 5ustar liggesusersreadstata13/R/readstata13.R0000644000176200001440000000103613077075424015037 0ustar liggesusers#' Import Stata Data Files #' #' Function to read the Stata file format into a data.frame. #' #' #' @author Marvin Garbuszus \email{jan.garbuszus@@ruhr-uni-bochum.de} #' @author Sebastian Jeworutzki \email{sebastian.jeworutzki@@ruhr-uni-bochum.de} #' #' @name readstata13 #' @docType package #' @useDynLib readstata13, .registration = TRUE #' @import Rcpp #' @note If you catch a bug, please do not sue us, we do not have any money. #' @seealso \code{\link[foreign]{read.dta}} and \code{memisc} for dta files from Stata #' Versions < 13 NULL readstata13/R/RcppExports.R0000644000176200001440000000102013103030636015167 0ustar liggesusers# Generated by using Rcpp::compileAttributes() -> do not edit by hand # Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 stata_pre13_save <- function(filePath, dat) { .Call('readstata13_stata_pre13_save', PACKAGE = 'readstata13', filePath, dat) } stata_read <- function(filePath, missing, selectrows) { .Call('readstata13_stata_read', PACKAGE = 'readstata13', filePath, missing, selectrows) } stata_save <- function(filePath, dat) { .Call('readstata13_stata_save', PACKAGE = 'readstata13', filePath, dat) } readstata13/R/tools.R0000644000176200001440000003562313103051163014054 0ustar liggesusers# # Copyright (C) 2014-2015 Jan Marvin Garbuszus and Sebastian Jeworutzki # # This program is free software; you can redistribute it and/or modify it # under the terms of the GNU General Public License as published by the # Free Software Foundation; either version 2 of the License, or (at your # option) any later version. # # This program is distributed in the hope that it will be useful, but WITHOUT # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or # FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for # more details. # # You should have received a copy of the GNU General Public License along # with this program. If not, see . # Wrapper Around iconv Calls for Code Readability # # @param x element to be converted # @param encoding encoding to be used. # @author Jan Marvin Garbuszus \email{jan.garbuszus@@ruhr-uni-bochum.de} # @author Sebastian Jeworutzki \email{sebastian.jeworutzki@@ruhr-uni-bochum.de} read.encoding <- function(x, fromEncoding, encoding) { iconv(x, from=fromEncoding, to=encoding , sub="byte") } save.encoding <- function(x, encoding) { iconv(x, to=encoding, sub="byte") } # Function to check if directory exists # @param x file path dir.exists13 <-function(x) { path <- dirname(x) return(file.exists(path)) } # Construct File Path # # @param path path to dta file # @author Jan Marvin Garbuszus \email{jan.garbuszus@@ruhr-uni-bochum.de} # @author Sebastian Jeworutzki \email{sebastian.jeworutzki@@ruhr-uni-bochum.de} get.filepath <- function(path=""){ if(substring(path, 1, 1) == "~") { filepath <- path.expand(path) } else { filepath <- path } if(!file.exists(filepath)) { return("File does not exist.") } return(filepath) } #' Show Default Label Language #' #' Displays informations about the defined label languages. #' #' @param dat \emph{data.frame.} Data.frame created by \code{read.dta13}. #' @param print \emph{logical.} If \code{TRUE}, print available languages and default language. #' @return Returns a list with two components: #' \describe{ #' \item{languages:}{Vector of label languages used in the dataset} #' \item{default:}{Name of the actual default label language, otherwise NA} #' } #' @details Stata allows to define multiple label sets in different languages. This functions reports the #' available languages and the selected default language. #' @author Jan Marvin Garbuszus \email{jan.garbuszus@@ruhr-uni-bochum.de} #' @author Sebastian Jeworutzki \email{sebastian.jeworutzki@@ruhr-uni-bochum.de} #' @export get.lang <- function(dat, print=T) { ex <- attr(dat, "expansion.fields") lang <- list() if(length(grep("_lang_list", ex)) > 0) { lang$languages <- strsplit(ex[[grep("_lang_list", ex)]][3], " ")[[1]] } else { lang$languages <- NA } lang$default <- ifelse(length(grep("_lang_c", ex)) > 0, ex[[grep("_lang_c", ex)]][3], NA) if(print) { cat("Available languages:\n ") cat(paste0(lang$languages, "\n")) cat("\nDefault language:\n") cat(paste0(" ",lang$default, "\n")) return(invisible(lang)) } return(lang) } #' Get Names of Stata Label Set #' #' Retrieves the Stata label set in the dataset for all or an vector of variable names. #' #' @param dat \emph{data.frame.} Data.frame created by \code{read.dta13}. #' @param var.name \emph{character vector.} Variable names. If \code{NULL}, get names of all label sets. #' @param lang \emph{character.} Label language. Default language defined by \code{\link{get.lang}} is used if NA #' @return Returns an named vector of variable labels #' @details Stata stores factor labels in variable independent labels sets. This function retrieves the name of the label set for a variable. #' @author Jan Marvin Garbuszus \email{jan.garbuszus@@ruhr-uni-bochum.de} #' @author Sebastian Jeworutzki \email{sebastian.jeworutzki@@ruhr-uni-bochum.de} #' @export get.label.name <- function(dat, var.name=NULL, lang=NA) { vnames <- names(dat) if (is.na(lang) | lang == get.lang(dat, F)$default) { labelsets <- attr(dat, "val.labels") names(labelsets) <- vnames } else if (is.character(lang)) { ex <- attr(dat, "expansion.fields") varname <- sapply(ex[grep(paste0("_lang_l_", lang), ex)], function(x) x[1]) labelsets.tmp <- sapply(ex[grep(paste0("_lang_l_", lang), ex)], function(x) x[3]) names(labelsets.tmp) <- varname labelsets <- rep("", length(vnames)) names(labelsets) <- vnames labelsets[varname] <- labelsets.tmp[varname] } if(is.null(var.name)) { return(labelsets) } else { return(labelsets[var.name]) } } #' Get Origin Code Numbers for Factors #' #' Recreates the code numbers of a factor as stored in the Stata dataset. #' #' @param x \emph{factor.} Factor to obtain code for #' @param label.table \emph{table.} Table with factor levels obtained by \code{\link{get.label}}. #' @return Returns an integer with original codes #' @details While converting numeric variables into factors, the original code numbers are lost. This function reconstructs the codes from the attribute \code{label.table}. #' @author Jan Marvin Garbuszus \email{jan.garbuszus@@ruhr-uni-bochum.de} #' @author Sebastian Jeworutzki \email{sebastian.jeworutzki@@ruhr-uni-bochum.de} #' @examples #' dat <- read.dta13(system.file("extdata/statacar.dta", package="readstata13")) #' labname <- get.label.name(dat,"type") #' labtab <- get.label(dat, labname) #' #' # comparsion #' get.origin.codes(dat$type, labtab) #' as.integer(dat$type) #' @export get.origin.codes <- function(x, label.table) { if(is.factor(x)) { fac <- as.character(x) return(as.integer(label.table[fac])) } else { message("x is no factor.") } } #' Get Stata Label Table for a Label Set #' #' Retrieve the value labels for a specific Stata label set. #' #' @param dat \emph{data.frame.} Data.frame created by \code{read.dta13}. #' @param label.name \emph{character.} Name of the Stata label set #' @return Returns a named vector of code numbers #' @details This function returns the table of factor levels which represent a Stata label set. #' The name of a label set for a variable can be obtained by \code{\link{get.label.name}}. #' @examples #' dat <- read.dta13(system.file("extdata/statacar.dta", package="readstata13")) #' labname <- get.label.name(dat,"type") #' get.label(dat, labname) #' @author Jan Marvin Garbuszus \email{jan.garbuszus@@ruhr-uni-bochum.de} #' @author Sebastian Jeworutzki \email{sebastian.jeworutzki@@ruhr-uni-bochum.de} #' @export get.label <- function(dat, label.name) { return(attr(dat, "label.table")[label.name][[1]]) } #' Assign Stata Labels to a Variable #' #' Assign value labels from a Stata label set to a variable. If duplicated labels are found, #' unique labels will be generated according the following scheme: "label_(integer code)". #' #' @param dat \emph{data.frame.} Data.frame created by \code{read.dta13}. #' @param var.name \emph{character.} Name of the variable in the data.frame #' @param lang \emph{character.} Label language. Default language defined by \code{\link{get.lang}} is used if NA #' @return Returns a labeled factor #' @examples #' dat <- read.dta13(system.file("extdata/statacar.dta", package="readstata13"), convert.factors=FALSE) #' #' # compare vectors #' set.label(dat, "type") #' dat$type #' #' # German label #' set.label(dat, "type", "de") #' @export set.label <- function(dat, var.name, lang=NA) { if(is.factor(dat[,var.name])) { tmp <- get.origin.codes(dat[,var.name], get.label(dat, get.label.name(dat, var.name))) } else { tmp <- dat[,var.name] } labtable <- get.label(dat, get.label.name(dat, var.name, lang)) #check for duplicated labels labcount <- table(names(labtable)) if(any(labcount > 1)) { warning(paste0("\n ",var.name, ":\n Duplicated factor levels detected - generating unique labels.\n")) labdups <- names(labtable) %in% names(labcount[labcount > 1]) # generate unique labels from assigned label and code number names(labtable)[labdups] <- paste0(names(labtable)[labdups], "_(", labtable[labdups], ")") } return(factor(tmp, levels=labtable, labels=names(labtable)) ) } #' Get and assign Stata Variable Labels #' #' Retrieve or set variable labels for a dataset. #' #' @name varlabel #' @rdname varlabel #' @param dat \emph{data.frame.} Data.frame created by \code{read.dta13}. #' @param var.name \emph{character vector.} Variable names. If NULL, get label for all variables. #' @param lang \emph{character.} Label language. Default language defined by \code{\link{get.lang}} is used if NA #' @param value \emph{character vector.} Vector of variable names. #' @return Returns an named vector of variable labels #' @author Jan Marvin Garbuszus \email{jan.garbuszus@@ruhr-uni-bochum.de} #' @author Sebastian Jeworutzki \email{sebastian.jeworutzki@@ruhr-uni-bochum.de} #' @aliases varlabel #' @aliases 'varlabel<-' NULL #' @rdname varlabel #' @export varlabel <- function(dat, var.name=NULL, lang=NA) { vnames <- names(dat) if (is.na(lang) | lang == get.lang(dat, F)$default) { varlabel <- attr(dat, "var.labels") names(varlabel) <- vnames } else if (is.character(lang)) { ex <- attr(dat, "expansion.fields") varname <- sapply(ex[grep(paste0("_lang_v_", lang), ex)], function(x) x[1]) varlabel <- sapply(ex[grep(paste0("_lang_v_", lang), ex)], function(x) x[3]) names(varlabel) <- varname } if(is.null(var.name)) { # order by data.frame columns and return return(varlabel[vnames]) } else { return(varlabel[var.name]) } } #' @rdname varlabel #' @export 'varlabel<-' <- function(dat, value) { nlabs <- length(attr(dat, "var.labels")) if(length(value)==nlabs) { attr(x, "var.labels") <- value } else { warning(paste("Vector of new labels must have",nlabs,"entries.")) } dat } #' Assign Stata Language Labels #' #' Changes default label language for a dataset. #' #' @param dat \emph{data.frame.} Data.frame created by \code{read.dta13}. #' @param lang \emph{character.} Label language. Default language defined by \code{\link{get.lang}} is used if NA #' @param generate.factors \emph{logical.} If \code{TRUE}, missing factor levels are generated. #' @return Returns a data.frame with value labels in language "lang". #' @examples #' dat <- read.dta13(system.file("extdata/statacar.dta", package="readstata13")) #' get.lang(dat) #' varlabel(dat) #' #' # set German label #' datDE <- set.lang(dat, "de") #' get.lang(datDE) #' varlabel(datDE) #' @author Jan Marvin Garbuszus \email{jan.garbuszus@@ruhr-uni-bochum.de} #' @author Sebastian Jeworutzki \email{sebastian.jeworutzki@@ruhr-uni-bochum.de} #' @importFrom stats na.omit #' @importFrom utils txtProgressBar setTxtProgressBar #' @export set.lang <- function(dat, lang=NA, generate.factors=FALSE) { if (is.na(lang) | lang == get.lang(dat, F)$default) { return(dat) } else if (is.character(lang)) { vnames <- names(dat) types <- attr(dat, "types") label <- attr(dat, "label.table") val.labels <- get.label.name(dat, NULL, lang) oldval.labels <- get.label.name(dat) oldval.labels <- oldval.labels[!is.na(oldval.labels)] oldlang <- get.lang(dat, F)$default cat("Replacing value labels. This might take some time...\n") pb <- txtProgressBar(min=1,max=length(val.labels)+1) for (i in seq_along(val.labels)) { if(val.labels[i]!="") { labname <- val.labels[i] vartype <- types[i] labtable <- label[[labname]] varname <- names(val.labels)[i] # get old codes if(is.factor(dat[, varname])) { oldlabname <- get.label.name(dat, varname) oldlabtab <- get.label(dat, oldlabname) codes <- get.origin.codes(dat[,varname], oldlabtab) varunique <- na.omit(unique(codes)) } else { varunique <- na.omit(unique(dat[,varname])) } if(labname %in% names(label) & vartype > 65527 & is.factor(dat[,varname])) { # assign label if label set is complete if (all(varunique %in% labtable)) { dat[,varname] <- factor(codes, levels=labtable, labels=names(labtable)) } # else generate labels from codes } else if(generate.factors) { names(varunique) <- as.character(varunique) gen.lab <- sort(c(varunique[!varunique %in% labtable], labtable)) dat[,varname] <- factor(dat[,varname], levels=gen.lab, labels=names(gen.lab)) } else { warning(paste(vnames[i], "Missing factor labels - no labels assigned. Set option generate.factors=T to generate labels.")) } setTxtProgressBar(pb, i) } } close(pb) # Save old default labels to expansion.fields. This is necessary to save # original labels for further use. vnames <- names(oldval.labels) names(oldval.labels) <- NULL tmp <- list() for (i in seq_along(val.labels)){ tmp[[i]] <- c(vnames[i],paste0("_lang_l_",oldlang), oldval.labels[i]) } attr(dat, "expansion.fields") <- c(attr(dat, "expansion.fields"),tmp) # variable label old.varlabel <- attr(dat, "var.labels") tmp <- list() for (i in seq_along(old.varlabel)){ tmp[[i]] <- c(vnames[i],paste0("_lang_v_", oldlang), old.varlabel[i]) } attr(dat, "expansion.fields") <- c(attr(dat, "expansion.fields"),tmp) ex <- attr(dat, "expansion.fields") varname <- sapply(ex[grep(paste0("_lang_v_", lang), ex)], function(x) x[1]) varlabel <- sapply(ex[grep(paste0("_lang_v_", lang), ex)], function(x) x[3]) names(varlabel) <- varname varlabel.out <- as.character(varlabel[vnames]) varlabel.out[is.na(varlabel.out)] <- "" attr(dat, "var.labels") <- varlabel.out # set new default lang and store string as default attributes names(val.labels) <- NULL attr(dat, "val.labels") <- val.labels attr(dat, "expansion.fields")[[ grep("_lang_c", attr(dat, "expansion.fields")) ]][3] <- lang return(dat) } } #' Check if numeric vector can be expressed as interger vector #' #' Compression can reduce numeric vectors as integers if the vector does only #' contain integer type data. #' #' @param x vector of data frame saveToExport <- function(x) { isTRUE(all.equal(x, as.integer(x))) } #' Check max char length of data.frame vectors #' #' Stata requires us to provide the maximum size of a charactervector as every #' row is stored in a bit region of this size. #' #' Ex: If the max chars size is four, _ is no character in this vector: #' 1. row: four #' 3. row: one_ #' 4. row: ____ #' #' If a character vector contains only missings or is empty, we will assign it a #' value of one, since Stata otherwise cannot handle what we write. #' #' @param x vector of data frame maxchar <- function(x) { z <- max(nchar(x, type="byte"), na.rm = TRUE) # Stata does not allow storing a string of size 0 if(is.infinite(z) | (z == 0)) z <- 1 z } readstata13/R/save.R0000644000176200001440000003124513103030636013650 0ustar liggesusers# # Copyright (C) 2014-2015 Jan Marvin Garbuszus and Sebastian Jeworutzki # # This program is free software; you can redistribute it and/or modify it # under the terms of the GNU General Public License as published by the # Free Software Foundation; either version 2 of the License, or (at your # option) any later version. # # This program is distributed in the hope that it will be useful, but WITHOUT # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or # FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for # more details. # # You should have received a copy of the GNU General Public License along # with this program. If not, see . #' Write Stata Binary Files #' #' \code{save.dta13} writes a Stata dta-file bytewise and saves the data #' into a dta-file. #' #' @param file \emph{character.} Path to the dta file you want to export. #' @param data \emph{data.frame.} A data.frame Object. #' @param data.label \emph{character.} Name of the dta-file. #' @param time.stamp \emph{logical.} If \code{TRUE}, add a time.stamp to the #' dta-file. #' @param convert.factors \emph{logical.} If \code{TRUE}, factors will be #' converted to Stata variables with labels. #' Stata expects strings to be encoded as Windows-1252, so all levels will be #' recoded. Character which can not be mapped in Windows-1252 will be saved as #' hexcode. #' @param convert.dates \emph{logical.} If \code{TRUE}, dates will be converted #' to Stata date time format. Code from \code{foreign::write.dta} #' @param convert.underscore \emph{logical.} If \code{TRUE}, all non numerics or #' non alphabet characters will be converted to underscores. #' @param tz \emph{character.} The name of the timezone convert.dates will use. #' @param add.rownames \emph{logical.} If \code{TRUE}, a new variable rownames #' will be added to the dta-file. #' @param compress \emph{logical.} If \code{TRUE}, the resulting dta-file will #' use all of Statas numeric-vartypes. #' @param version \emph{numeric.} Stata format for the resulting dta-file either #' the internal Stata dta-format (e.g. 117 for Stata 13) or versions 6 - 14. #' @return The function writes a dta-file to disk. The following features of the #' dta file format are supported: #' \describe{ #' \item{datalabel:}{Dataset label} #' \item{time.stamp:}{Timestamp of file creation} #' \item{formats:}{Stata display formats. May be used with #' \code{\link[base]{sprintf}}} #' \item{type:}{Stata data type (see Stata Corp 2014)} #' \item{var.labels:}{Variable labels} #' \item{version:}{dta file format version} #' \item{strl:}{List of character vectors for the new strL string variable #' type. The first element is the identifier and the second element the #' string.} #' } #' @seealso \code{\link[foreign]{read.dta}} in package \code{foreign} and \code{memisc} for dta files from Stata #' versions < 13 and \code{read_dta} in package \code{haven} for Stata version >= 13. #' @references Stata Corp (2014): Description of .dta file format #' \url{http://www.stata.com/help.cgi?dta} #' @author Jan Marvin Garbuszus \email{jan.garbuszus@@ruhr-uni-bochum.de} #' @author Sebastian Jeworutzki \email{sebastian.jeworutzki@@ruhr-uni-bochum.de} #' @useDynLib readstata13 #' @importFrom utils localeToCharset #' @export save.dta13 <- function(data, file, data.label=NULL, time.stamp=TRUE, convert.factors=TRUE, convert.dates=TRUE, tz="GMT", add.rownames=FALSE, compress=FALSE, version=117, convert.underscore=FALSE){ if (!is.data.frame(data)) stop("The object \"data\" must have class data.frame") if (!dir.exists13(dirname(file))) stop("Path is invalid. Possibly a non existend directory.") # Allow writing version as Stata version not Stata format if (version==14L) version <- 118 if (version==13L) version <- 117 if (version==12L) version <- 115 if (version==11L | version==10L) version <- 114 if (version==9L | version==8L) version <- 113 if (version==7) version <- 110 if (version==6) version <- 108 if (version<102 | version == 109 | version == 116 | version>118) stop("Version missmatch abort execution. No Data was saved.") sstr <- 2045 sstrl <- 32768 sdouble <- 65526 sfloat <- 65527 slong <- 65528 sint <- 65529 sbyte <- 65530 if (version < 117) { sstr <- 244 sstrl <- 244 sdouble <- 255 sfloat <- 254 slong <- 253 sint <- 252 sbyte <- 251 } if (version<111 | version==112) sstrl <- 80 if(!is.data.frame(data)) { stop("Object is not of class data.frame.") } # Is recoding necessary? if (version<=117) { # Reencoding is always needed doRecode <- TRUE toEncoding <- "CP1252" } else if (toupper(localeToCharset()[1])!="UTF-8") { # If R runs in a non UTF-8 locale and Stata > 13 doRecode <- TRUE toEncoding <- "UTF-8" } else { # utf-8 and Stata > 13 doRecode <- FALSE } if (add.rownames) { if (doRecode) { rwn <- save.encoding(rownames(data), toEncoding) } else { rwn <-rownames(data) } data <- data.frame(rownames= rwn, data, stringsAsFactors = F) } rownames(data) <- NULL if (convert.underscore) { names(data) <- gsub("[^a-zA-Z0-9_]", "_", names(data)) names(data)[grepl("^[0-9]", names(data))] <- paste0( "_", names(data)[grepl("^[0-9]", names(data))]) } filepath <- path.expand(file) # For now we handle numeric and integers vartypen <- sapply(data, class) names(vartypen) <- names(data) # Convert logicals to integers for (v in names(vartypen[vartypen == "logical"])) data[[v]] <- as.integer(data[[v]]) vartypen <- vtyp <- sapply(data, class) if (convert.factors){ if (version < 106) { hasfactors <- sapply(data, is.factor) if (any(hasfactors)) warning("dta-format < 106 does not handle factors. Labels are not saved!") } # If our data.frame contains factors, we create a label.table factors <- which(sapply(data, is.factor)) f.names <- attr(factors,"names") label.table <- vector("list", length(f.names)) names(label.table) <- f.names valLabel <- sapply(data, class) valLabel[valLabel != "factor"] <- "" i <- 0 for (v in factors) { i <- i + 1 if (doRecode) { f.levels <- save.encoding(levels(data[[v]]), toEncoding) } else { f.levels <- levels(data[[v]]) } f.labels <- as.integer(labels(levels(data[[v]]))) attr(f.labels, "names") <- f.levels f.labels <- f.labels[names(f.labels) != ".."] label.table[[ (f.names[i]) ]] <- f.labels valLabel[v] <- f.names[i] } attr(data, "label.table") <- rev(label.table) if (doRecode) { valLabel <- save.encoding(valLabel, toEncoding) } attr(data, "vallabels") <- valLabel } else { attr(data, "label.table") <- NULL attr(data, "vallabels") <- rep("",length(data)) } if (convert.dates) { dates <- which(sapply(data, function(x) inherits(x, "Date")) ) for (v in dates) data[[v]] <- as.vector( julian(data[[v]],as.Date("1960-1-1", tz = "GMT")) ) dates <- which( sapply(data, function(x) inherits(x,"POSIXt")) ) for (v in dates) data[[v]] <- as.vector( round(julian(data[[v]], ISOdate(1960, 1, 1, tz = tz))) ) } # is.numeric is TRUE for integers ff <- sapply(data, is.numeric) ii <- sapply(data, is.integer) factors <- sapply(data, is.factor) empty <- sapply(data, function(x) all(is.na(x) & !is.character(x))) ddates <- vartypen == "Date" # default no compression: numeric as double; integer as long; date as date; # empty as byte if (!compress) { vartypen[ff] <- sdouble vartypen[ii] <- slong vartypen[factors] <- slong vartypen[ddates] <- -sdouble vartypen[empty] <- sbyte } else { varTmin <- sapply(data[(ff | ii) & !empty], function(x) min(x,na.rm=TRUE)) varTmax <- sapply(data[(ff | ii) & !empty], function(x) max(x,na.rm=TRUE)) # check if numerics can be stored as integers numToCompress <- sapply(data[ff], saveToExport) if (any(numToCompress)) { saveToConvert <- names(ff[numToCompress]) # replace numerics as intergers data[saveToConvert] <- sapply(data[saveToConvert], as.integer) # recheck after update ff <- sapply(data, is.numeric) ii <- sapply(data, is.integer) } vartypen[ff] <- sdouble bmin <- -127; bmax <- 100 imin <- -32767; imax <- 32740 # check if integer is byte, int or long for (k in names(which(ii & !empty))) { vartypen[k][varTmin[k] < imin | varTmax[k] > imax] <- slong vartypen[k][varTmin[k] > imin & varTmax[k] < imax] <- sint vartypen[k][varTmin[k] > bmin & varTmax[k] < bmax] <- sbyte } factorlength <- sapply(data[factors & !empty], nlevels) for (k in names(which(factors & !empty))) { vartypen[factors & factorlength[k] > 0x1.000000p127] <- slong vartypen[factors & factorlength[k] < 0x1.000000p127] <- sint vartypen[factors & factorlength[k] < 101] <- sbyte } # keep dates as is vartypen[ddates] <- -sdouble # cast empty variables as byte vartypen[empty] <- sbyte } # recode character variables. 118 wants utf-8, so encoding may be required if(doRecode) { #TODO: use seq_len ? for(v in (1:ncol(data))[vartypen == "character"]) { data[, v] <- save.encoding(data[, v], toEncoding) } } # str and strL are stored by maximum length of chars in a variable str.length <- sapply(data[vartypen == "character"], FUN=maxchar) str.length[str.length > sstr] <- sstrl for (v in names(vartypen[vartypen == "character"])) { # str.length[str.length > sstr] <- sstrl # no loop necessary! vartypen[[v]] <- str.length[[v]] } # save type bevor abs() formats <- vartypen vartypen <- abs(as.integer(vartypen)) attr(data, "types") <- vartypen # ToDo: Add propper check. # # value_label_names must be < 33 chars # if (sapply(valLabel,FUN=maxchar) >= 33) # message ("at least one variable name is to long.") # Resize varnames to 32. Stata requires this. It allows storing 32*4 bytes, # but can not work with longer variable names. Chars can be 1 - 4 bytes we # count the varnames in R. Get nchars and trim them. varnames <- names(data) lenvarnames <- sapply(varnames, nchar) if (any (lenvarnames > 32) & version >= 117) { message ("Varname to long. Resizing. Max size is 32.") names(data) <- sapply(varnames, strtrim, width = 32) } # Stata format "%9,0g" means european format formats <- vartypen formats[vtyp == "Date"] <- "%td" formats[formats == sdouble] <- "%9.0g" formats[formats == sfloat] <- "%9.0g" formats[formats == slong] <- "%9.0g" formats[formats == sint] <- "%9.0g" formats[formats == sbyte] <- "%9.0g" formats[vartypen >= 0 & vartypen <= sstr] <- paste0("%", formats[vartypen >= 0 & vartypen <= sstr], "s") attr(data, "formats") <- formats # Create a datalabel if (is.null(data.label)) { attr(data, "datalabel") <- "Written by R" } else { if (version == 102L) warning("Format 102 does not print a data label in Stata.") if (doRecode) { data.label <- save.encoding(data.label, toEncoding) } attr(data, "datalabel") <- data.label } # Create the 17 char long timestamp. It may contain 17 char long strings if (!time.stamp) { attr(data, "timestamp") <- "" } else { lct <- Sys.getlocale("LC_TIME"); Sys.setlocale("LC_TIME", "C") attr(data, "timestamp") <- format(Sys.time(), "%d %b %Y %H:%M") Sys.setlocale("LC_TIME",lct) } expfield <- attr(data, "expansion.fields") if (doRecode) { expfield <- lapply(expfield, function(x) iconv(x, to=toEncoding)) } attr(data, "expansion.fields") <- rev(expfield) attr(data, "version") <- as.character(version) if (version < 117) attr(data, "version") <- version # If length of varlabels differs from ncols drop varlabels. This can happen, # when the initial data.frame was read by read.dta13 and another variable was # attached. In this case the last variable label has a non existing variable # label which will crash our Rcpp code. Since varlabels do not respect the # ordering inside the data frame, we simply drop them. varlabels <- attr(data, "var.labels") if (!is.null(varlabels) & (length(varlabels)!=ncol(data))) { attr(data, "var.labels") <- NULL warning("Number of variable labels does not match number of variables. Variable labels dropped.") } if (version >= 117) invisible( stata_save(filePath = filepath, dat = data) ) else invisible( stata_pre13_save(filePath = filepath, dat = data) ) } readstata13/R/read.R0000644000176200001440000003571313103051157013632 0ustar liggesusers# Copyright (C) 2014-2015 Jan Marvin Garbuszus and Sebastian Jeworutzki # Copyright (C) of 'convert.dates' and 'missing.types' Thomas Lumley # # This program is free software; you can redistribute it and/or modify it # under the terms of the GNU General Public License as published by the # Free Software Foundation; either version 2 of the License, or (at your # option) any later version. # # This program is distributed in the hope that it will be useful, but WITHOUT # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or # FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for # more details. # # You should have received a copy of the GNU General Public License along # with this program. If not, see . #' Read Stata Binary Files #' #' \code{read.dta13} reads a Stata dta-file and imports the data into a #' data.frame. #' #' @param file \emph{character.} Path to the dta file you want to import. #' @param convert.factors \emph{logical.} If \code{TRUE}, factors from Stata #' value labels are created. #' @param generate.factors \emph{logical.} If \code{TRUE} and convert.factors is #' TRUE, missing factor labels are created from integers. If duplicated labels are found, #' unique labels will be generated according the following scheme: "label_(integer code)". #' @param encoding \emph{character.} Strings can be converted from Windows-1252 or UTF-8 #' to system encoding. Options are "latin1" or "UTF-8" to specify target #' encoding explicitly. Stata 14 files are UTF-8 encoded and may contain strings #' which can't be displayed in the current locale. #' Set encoding=NULL to stop reencoding. #' @param fromEncoding \emph{character.} We expect strings to be encoded as #' "CP1252" for Stata Versions 13 and older. For dta files saved with Stata 14 #' or newer "UTF-8" is used. In some situation the used encoding can differ for #' Stata 14 files and must be manually set. #' @param convert.underscore \emph{logical.} If \code{TRUE}, "_" in variable #' names will be changed to "." #' @param missing.type \emph{logical.} Stata knows 27 different missing types: #' ., .a, .b, ..., .z. If \code{TRUE}, attribute \code{missing} will be #' created. #' @param replace.strl \emph{logical.} If \code{TRUE}, replace the reference to #' a strL string in the data.frame with the actual value. The strl attribute #' will be removed from the data.frame (see details). #' @param convert.dates \emph{logical.} If \code{TRUE}, Stata dates are #' converted. #' @param add.rownames \emph{logical.} If \code{TRUE}, the first column will be #' used as rownames. Variable will be dropped afterwards. #' @param nonint.factors \emph{logical.} If \code{TRUE}, factors labels #' will be assigned to variables of type float and double. #' @param select.rows \emph{integer.} Vector of one or two numbers. If single #' value rows from 1:val are selected. If two values of a range are selected #' the rows in range will be selected. #' #' @details If the filename is a url, the file will be downloaded as a temporary #' file and read afterwards. #' #' Stata files are encoded in ansinew. Depending on your system's default encoding #' certain characters may appear wrong. Using a correct encoding may fix these. #' #' Variable names stored in the dta-file will be used in the resulting #' data.frame. Stata types char, byte, and int will become integer; float and #' double will become numerics. R only knows a single missing type, while Stata #' knows 27, so all Stata missings will become NA in R. If you need to keep #' track of Statas original missing types, you may use #' \code{missing.type=TRUE}. #' #' Stata dates are converted to R's Date class the same way foreign handles #' dates. #' #' Stata 13 introduced a new character type called strL. strLs are able to store #' strings up to 2 billion characters. While R is able to store #' strings of this size in a character vector, the printed representation of such #' vectors looks rather cluttered, so it's possible to save only a reference in the #' data.frame with option \code{replace.strl=FALSE}. #' #' In R, you may use rownames to store characters (see for instance #' \code{data(swiss)}). In Stata, this is not possible and rownames have to be #' stored as a variable. If you want to use rownames, set add.rownames to TRUE. #' Then the first variable of the dta-file will hold the rownames of the resulting #' data.frame. #' #' Reading dta-files of older and newer versions than 13 was introduced #' with version 0.8. #' @return The function returns a data.frame with attributes. The attributes #' include #' \describe{ #' \item{datalabel:}{Dataset label} #' \item{time.stamp:}{Timestamp of file creation} #' \item{formats:}{Stata display formats. May be used with #' \code{\link{sprintf}}} #' \item{types:}{Stata data type (see Stata Corp 2014)} #' \item{val.labels:}{For each variable the name of the associated value #' labels in "label"} #' \item{var.labels:}{Variable labels} #' \item{version:}{dta file format version} #' \item{label.table:}{List of value labels.} #' \item{strl:}{Character vector with long strings for the new strl string variable #' type. The name of every element is the identifier.} #' \item{expansion.fields:}{list providing variable name, characteristic name #' and the contents of Stata characteristic field.} #' \item{missing:}{List of numeric vectors with Stata missing type for each #' variable.} #' } #' @note read.dta13 uses GPL 2 licensed code by Thomas Lumley and R-core members #' from foreign::read.dta(). #' @seealso \code{\link[foreign]{read.dta}} in package \code{foreign} and \code{memisc} for dta files from Stata #' versions < 13 and \code{read_dta} in package \code{haven} for Stata version >= 13. #' @references Stata Corp (2014): Description of .dta file format #' \url{http://www.stata.com/help.cgi?dta} #' @author Jan Marvin Garbuszus \email{jan.garbuszus@@ruhr-uni-bochum.de} #' @author Sebastian Jeworutzki \email{sebastian.jeworutzki@@ruhr-uni-bochum.de} #' @useDynLib readstata13 #' @importFrom utils download.file #' @importFrom stats na.omit #' @export read.dta13 <- function(file, convert.factors = TRUE, generate.factors=FALSE, encoding = "UTF-8", fromEncoding=NULL, convert.underscore = FALSE, missing.type = FALSE, convert.dates = TRUE, replace.strl = TRUE, add.rownames = FALSE, nonint.factors=FALSE, select.rows = NULL) { # Check if path is a url if (length(grep("^(http|ftp|https)://", file))) { tmp <- tempfile() download.file(file, tmp, quiet = TRUE, mode = "wb") filepath <- tmp on.exit(unlink(filepath)) } else { # construct filepath and read file filepath <- get.filepath(file) } if (!file.exists(filepath)) return(message("File not found.")) # some select.row checks if (!is.null(select.rows)) { # check that it is a numeric if (!is.numeric(select.rows)){ return(message("select.rows must be of type numeric")) } else { # guard against negative values if (any(select.rows < 0) ) select.rows <- abs(select.rows) # check that lenght is not > 2 if (length(select.rows) > 2) return(message("select.rows must be of length 1 or 2.")) # if lenght 1 start at row 1 if (length(select.rows) == 1) select.rows <- c(1, select.rows) } # reorder if 2 is bigger than 1 if (select.rows[2] < select.rows[1]) select.rows <- c(select.rows[2], select.rows[1]) # make sure to start at index position 1 if select.rows[2] > 0 if (select.rows[2] > 0 & select.rows[1] == 0) select.rows[1] <- 1 } else { # set a value select.rows <- c(0,0) } data <- stata_read(filepath, missing.type, select.rows) version <- attr(data, "version") sstr <- 2045 sstrl <- 32768 sdouble <- 65526 sfloat <- 65527 slong <- 65528 sint <- 65529 sbyte <- 65530 if (version < 117) { sstr <- 244 sstrl <- 255 sdouble <- 255 sfloat <- 254 slong <- 253 sint <- 252 sbyte <- 251 } if (convert.underscore) names(data) <- gsub("_", ".", names(data)) types <- attr(data, "types") val.labels <- attr(data, "val.labels") label <- attr(data, "label.table") if (missing.type) { stata.na <- data.frame(type = sdouble:sbyte, min = c(101, 32741, 2147483621, 2 ^ 127, 2 ^ 1023), inc = c(1, 1, 1, 2 ^ 115, 2 ^ 1011) ) if (version >= 113L & version < 117L) { missings <- vector("list", length(data)) names(missings) <- names(data) for (v in which(types > 250L)) { this.type <- types[v] - 250L nas <- is.na(data[[v]]) | data[[v]] >= stata.na$min[this.type] natype <- (data[[v]][nas] - stata.na$min[this.type])/ stata.na$inc[this.type] natype[is.na(natype)] <- 0L missings[[v]] <- rep(NA, NROW(data)) missings[[v]][nas] <- natype data[[v]][nas] <- NA } attr(data, "missing") <- missings } else { if (version >= 117L) { missings <- vector("list", length(data)) names(missings) <- names(data) for (v in which(types > 65525L)) { this.type <- 65531L - types[v] nas <- is.na(data[[v]]) | data[[v]] >= stata.na$min[this.type] natype <- (data[[v]][nas] - stata.na$min[this.type]) / stata.na$inc[this.type] natype[is.na(natype)] <- 0L missings[[v]] <- rep(NA, NROW(data)) missings[[v]][nas] <- natype data[[v]][nas] <- NA } attr(data, "missing") <- missings } else warning("'missing.type' only applicable to version >= 8 files") } } var.labels <- attr(data, "var.labels") ## Encoding if(!is.null(encoding)) { # set from encoding by dta version if(is.null(fromEncoding)) { fromEncoding <- "CP1252" if(attr(data, "version") >= 118L) fromEncoding <- "UTF-8" } # varnames names(data) <- read.encoding(names(data), fromEncoding, encoding) # var.labels attr(data, "var.labels") <- read.encoding(var.labels, fromEncoding, encoding) # val.labels names(val.labels) <- read.encoding(val.labels, fromEncoding, encoding) attr(data, "val.labels") <- val.labels # label names(label) <- read.encoding(names(label), fromEncoding, encoding) if (length(label) > 0) { for (i in 1:length(label)) { names(label[[i]]) <- read.encoding(names(label[[i]]), fromEncoding, encoding) } attr(data, "label.table") <- label } # recode character variables for (v in (1:ncol(data))[types <= sstr]) { data[, v] <- iconv(data[, v], from=fromEncoding, to=encoding, sub="byte") } # expansion.field efi <- attr(data, "expansion.fields") if (length(efi) > 0) { efiChar <- unlist(lapply(efi, is.character)) for (i in (1:length(efi))[efiChar]) { efi[[i]] <- read.encoding(efi[[i]], fromEncoding, encoding) } attr(data, "expansion.fields") <- efi } if (version >= 117L) { #strl strl <- attr(data, "strl") if (length(strl) > 0) { for (i in 1:length(strl)) { strl[[i]] <- read.encoding(strl[[i]], fromEncoding, encoding) } attr(data, "strl") <- strl } } } var.labels <- attr(data, "var.labels") if (replace.strl & version >= 117L) { strl <- c("") names(strl) <- "00000000000000000000" strl <- c(strl, attr(data,"strl")) for (j in seq(ncol(data))[types == 32768] ) { data[, j] <- strl[data[,j]] } # if strls are in data.frame remove attribute strl attr(data, "strl") <- NULL } if (convert.dates) { convert_dt_c <- function(x) as.POSIXct((x + 0.1) / 1000, origin = "1960-01-01") # avoid rounding down convert_dt_C <- function(x) { ls <- .leap.seconds + seq_along(.leap.seconds) + 315619200 z <- (x + 0.1) / 1000 # avoid rounding down z <- z - rowSums(outer(z, ls, ">=")) as.POSIXct(z, origin = "1960-01-01") } ff <- attr(data, "formats") ## dates <- grep("%-*d", ff) ## Stata 12 introduced 'business dates' ## 'Formats beginning with %t or %-t are Stata's date and time formats.' ## but it seems some are earlier. ## The dta_115 description suggests this is too inclusive: ## 'Stata has an old *%d* format notation and some datasets ## still have them. Format *%d*... is equivalent to modern ## format *%td*... and *%-d*... is equivalent to *%-td*...' dates <- grep("^%(-|)(d|td)", ff) ## avoid as.Date in case strptime is messed up base <- structure(-3653L, class = "Date") # Stata dates are integer vars for (v in dates) data[[v]] <- structure(base + data[[v]], class = "Date") for (v in grep("%tc", ff)) data[[v]] <- convert_dt_c(data[[v]]) for (v in grep("%tC", ff)) data[[v]] <- convert_dt_C(data[[v]]) } if (convert.factors) { vnames <- names(data) for (i in seq_along(val.labels)) { labname <- val.labels[i] vartype <- types[i] labtable <- label[[labname]] #don't convert columns of type double or float to factor if (labname %in% names(label)) { if((vartype == sdouble | vartype == sfloat)) { if(!nonint.factors) { warning(paste0("\n ",vnames[i], ":\n Factor codes of type double or float detected - no labels assigned.\n Set option nonint.factors to TRUE to assign labels anyway.\n")) next } } # get unique values / omit NA varunique <- na.omit(unique(data[, i])) # assign label if label set is complete if (all(varunique %in% labtable)) { #check for duplicated labels labcount <- table(names(labtable)) if(any(labcount > 1)) { warning(paste0("\n ",vnames[i], ":\n Duplicated factor levels detected - generating unique labels.\n")) labdups <- names(labtable) %in% names(labcount[labcount > 1]) # generate unique labels from assigned label and code number names(labtable)[labdups] <- paste0(names(labtable)[labdups], "_(", labtable[labdups], ")") } data[, i] <- factor(data[, i], levels=labtable, labels=names(labtable)) # else generate labels from codes } else if (generate.factors) { names(varunique) <- as.character(varunique) gen.lab <- sort(c(varunique[!varunique %in% labtable], labtable)) data[, i] <- factor(data[, i], levels=gen.lab, labels=names(gen.lab)) } else { warning(paste0("\n ",vnames[i], ":\n Missing factor labels - no labels assigned.\n Set option generate.factors=T to generate labels.")) } } } } if (add.rownames) { rownames(data) <- data[[1]] data[[1]] <- NULL } return(data) } readstata13/R/dbcal.R0000644000176200001440000001441113077075424013771 0ustar liggesusers# # Copyright (C) 2015 Jan Marvin Garbuszus and Sebastian Jeworutzki # # This program is free software; you can redistribute it and/or modify it # under the terms of the GNU General Public License as published by the # Free Software Foundation; either version 2 of the License, or (at your # option) any later version. # # This program is distributed in the hope that it will be useful, but WITHOUT # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or # FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for # more details. # # You should have received a copy of the GNU General Public License along # with this program. If not, see . #' Parse Stata business calendar files #' #' Create conversion table for business calendar dates. #' #' @param stbcalfile \emph{stbcal-file} Stata buisness calendar file created by #' Stata. #' @return Returns a data.frame with two cols: #' \describe{ #' \item{range:}{The date matching the buisnesdate. Date format.} #' \item{buisdays:}{The Stata business calendar day. Integer format.} #' } #' @details Stata 12 introduced business calender format. Business dates are #' integer numbers in a certain range of days, weeks, months or years. In this #' range some days are omitted (e.g. weekends or holidays). If a business #' calendar was created, a stbcal file matching this calendar was created. This #' file is required to read the business calendar. This parser reads the stbcal- #' file and returns a data.frame with dates matching business calendar dates. #' #' A dta-file containing Stata business dates imported with read.stata13() shows #' in formats which stdcal file is required (e.g. "%tbsp500" requires #' sp500.stbcal). #' #' Stata allows adding a short description called purpose. This is added as an #' attribute of the resulting data.frame. #' @author Jan Marvin Garbuszus \email{jan.garbuszus@@ruhr-uni-bochum.de} #' @author Sebastian Jeworutzki \email{sebastian.jeworutzki@@ruhr-uni-bochum.de} #' @examples #' sp500 <- stbcal(system.file("extdata/sp500.stbcal", package="readstata13")) #' @importFrom stats complete.cases #' @export stbcal <- function(stbcalfile) { # Otherwise localised dates will be used. lct <- Sys.getlocale("LC_TIME"); Sys.setlocale("LC_TIME", "C") # Parse full file stbcal <- file(stbcalfile, "rb") x <- readLines(stbcal, file.info(stbcalfile)$size) close(stbcal) # Dateformat can be ymd, ydm, myd, mdy, dym or dmy if(any(grepl("dateformat ymd", x))) dateformat <- "%Y%b%d" if(any(grepl("dateformat ydm", x))) dateformat <- "%Y%d%b" if(any(grepl("dateformat myd", x))) dateformat <- "%b%Y%d" if(any(grepl("dateformat mdy", x))) dateformat <- "%b%d%Y" if(any(grepl("dateformat dym", x))) dateformat <- "%b%Y%d" if(any(grepl("dateformat dmy", x))) dateformat <- "%d%b%Y" # Range of stbcal. Range is required, contains start and end. rangepos <- grep("range", x) range <- x[rangepos] range <- strsplit(range, " ") rangestart <- range[[1]][2] rangestop <- range[[1]][3] range <- seq(from= as.Date(rangestart, dateformat), to= as.Date(rangestop, dateformat), "days") # Centerdate of stbcal. Date that matches 0. centerpos <- grep("centerdate", x) centerdate <- x[centerpos] centerdate <- gsub("centerdate ","",centerdate) centerdate <- as.Date(centerdate, dateformat) # Omit Dayofweek omitdayofweekpos <- grep ("omit dayofweek", x) omitdayofweek <- x[omitdayofweekpos] # Mo, Tu, We, Th, Fr, Sa, Su daysofweek <- weekdays(as.Date(range)) stbcal <- data.frame(range = range, daysofweek=daysofweek) # Weekdays every week if (any(grepl("Mo", omitdayofweek))) stbcal$daysofweek[stbcal$daysofweek=="Monday"] <- NA if (any(grepl("Tu", omitdayofweek))) stbcal$daysofweek[stbcal$daysofweek=="Tuesday"] <- NA if (any(grepl("We", omitdayofweek))) stbcal$daysofweek[stbcal$daysofweek=="Wednesday"] <- NA if (any(grepl("Th", omitdayofweek))) stbcal$daysofweek[stbcal$daysofweek=="Thursday"] <- NA if (any(grepl("Fr", omitdayofweek))) stbcal$daysofweek[stbcal$daysofweek=="Friday"] <- NA if (any(grepl("Sa", omitdayofweek))) stbcal$daysofweek[stbcal$daysofweek=="Saturday"] <- NA if (any(grepl("Su", omitdayofweek))) stbcal$daysofweek[stbcal$daysofweek=="Sunday"] <- NA # Special days to be omitted if (any(grepl("omit date", x))) { dates <- grep("omit date", x) omitdates <- x[dates] omitdates <- gsub("omit date ", "", omitdates) dates <- as.Date(omitdates, dateformat) stbcal$daysofweek[which(stbcal$range%in%dates)] <- NA # Keep only wanted days stbcal$daysofweek behalten stbcal <- stbcal[complete.cases(stbcal$daysofweek),] } # In case centerdate is not rangestart: stbcal$buisdays <- NA stbcal$buisdays[stbcal$range==centerdate] <- 0 stbcal$buisdays[stbcal$rangecenterdate] <- seq(from=1, to=length(stbcal$range[stbcal$range>centerdate])) # Add purpose if (any(grepl("purpose", x))) { purposepos <- grep("purpose", x) purpose <- x[purposepos] attr(stbcal, "purpose") <- purpose } # restore locale Sys.setlocale("LC_TIME", lct) return(stbcal) } #' Convert Stata business calendar dates in readable dates. #' #' Convert Stata business calendar dates in readable dates. #' #' @param buisdays numeric Vector of business dates #' @param cal data.frame Conversion table for business calendar dates #' @param format character String with date format as in \code{\link{as.Date}} #' @return Returns a vector of readable dates. #' @author Jan Marvin Garbuszus \email{jan.garbuszus@@ruhr-uni-bochum.de} #' @author Sebastian Jeworutzki \email{sebastian.jeworutzki@@ruhr-uni-bochum.de} #' @examples #' # read business calendar and data #' sp500 <- stbcal(system.file("extdata/sp500.stbcal", package="readstata13")) #' dat <- read.dta13(system.file("extdata/statacar.dta", package="readstata13")) #' #' # convert dates and check #' dat$ldatescal2 <- as.caldays(dat$ldate, sp500) #' all(dat$ldatescal2==dat$ldatescal) #' @export as.caldays <- function(buisdays, cal, format="%Y-%m-%d") { rownames(cal) <- cal$buisdays dates <- cal[as.character(buisdays), "range"] if(!is.null(format)) as.Date(dates, format = format) return(dates) } readstata13/README.md0000644000176200001440000001055713103037350013650 0ustar liggesusers# readstata13 Package to read and write all Stata file formats (version 14 and older) into a R data.frame. The dta file format versions 102 to 118 are supported. The function ```read.dta``` from the foreign package imports only dta files from Stata versions <= 12. Due to the different structure and features of dta 117 files, we wrote a new file reader in Rcpp. Additionally the package supports many features of the Stata dta format like label sets in different languages (`?set.lang`) or business calendars (`?as.caldays`). ## Installation The package is now hosted on CRAN. ```R install.packages("readstata13") ``` ## Usage ```R library(readstata13) dat <- read.dta13("path to file.dta") save.dta13(dat, file="newfile.dta") ``` ## Development Version To install the current release from github you need the plattform specific build tools. On Windows a current installation of [Rtools](https://cran.r-project.org/bin/windows/Rtools/) is necessary, while OS X users need to install [Xcode](https://itunes.apple.com/us/app/xcode/id497799835). ```R # install.packages("devtools") devtools::install_github("sjewo/readstata13", ref="0.9.0") ``` Older Versions of devtools require a username option: ```R install_github("readstata13", username="sjewo", ref="0.9.0") ``` To install the current development version from github: ```R devtools::install_github("sjewo/readstata13", ref="testing") ``` ## Current Status [![Build Status](https://travis-ci.org/sjewo/readstata13.svg?branch=master)](https://travis-ci.org/sjewo/readstata13) [![CRAN Downloads](http://cranlogs.r-pkg.org/badges/readstata13)](https://cran.r-project.org/package=readstata13) ### Working features * [0.9.0] Generate unique factor labels to prevent errors in factor definition * [0.9.0] check interrupt for long read. Patch by Giovanni Righi * [0.9.0] updates to notes, roxygen and register * [0.9.0] fixed size of character length. Bug reported by Yiming (Paul) Li * [0.9.0] fix saving characters containing missings. Bug reported by Eivind H. Olsen * [0.9.0] adjustments to convert.underscore. Patch by luke-m-olson * [0.9.0] alow partial reading of selected rows * [0.8.5] fix errors on big-endians systems * [0.8.4] fix valgrind errors. converting from dta.write to writestr * [0.8.4] fix for empty data label * [0.8.4] make replace.strl default * [0.8.3] restrict length of varnames to 32 chars for compatibility with Stata 14 * [0.8.3] add many function tests * [0.8.3] avoid converting of double to floats while writing compressed files * [0.8.2] save NA values in character vector as empty string * [0.8.2] convert.underscore=T will convert all non-literal characters to underscores * [0.8.2] fix saving of Dates * [0.8.2] save with convert.factors by default * [0.8.2] test for NaN and inf values while writing missing values and replace with NA * [0.8.2] remove message about saving factors * [0.8.1] convert non-integer variables to factors (```nonint.factors=T```) * [0.8.1] handle large datasets * [0.8.1] working with strL variables is now a lot faster * reading data files from disk or url and create a data.frame * saving dta files to disk - most features of the dta file format are supported * assign variable names * read the new strL strings and save them as attribute * convert stata label to factors and save them as attribute * read some meta data (timestamp, dataset label, formats,...) * convert strings to system encoding * handle different NA values * handle multiple label languages * convert dates * reading business calendar files ### Todo * cleanup of Rcpp code ### Test Since our attributes differ from foreign::read.dta all.equal and identical report false. If you check the values, everything is identical. ```R library("foreign") r12 <- read.dta("http://www.stata-press.com/data/r12/auto.dta") r13 <- read.dta13("http://www.stata-press.com/data/r13/auto.dta") Map(identical,r12,r13) att <- names(attributes(r12)) for (i in seq(att)) cat(att[i],":", all.equal(attr(r12,att[i]),attr(r13,att[i])),"\n") r12 <- read.dta("http://www.stata-press.com/data/r12/auto.dta",convert.factors=F) r13 <- read.dta13("http://www.stata-press.com/data/r13/auto.dta",convert.factors=F) Map(identical,r12,r13) ``` ## Authors [Marvin Garbuszus](mailto:jan.garbuszus@ruhr-uni-bochum.de) ([JanMarvin](https://github.com/JanMarvin)) and [Sebastian Jeworutzki](mailto:Sebastian.Jeworutzki@ruhr-uni-bochum.de) (both Ruhr-Universität Bochum) ## Licence GPL2 readstata13/MD50000644000176200001440000000616613103134150012676 0ustar liggesusers21070e17138c58531d8346b9b63902ea *DESCRIPTION e8c1458438ead3c34974bc0be3a03ed6 *LICENSE b3ae712c9babfa659c890e1a0a81ddc6 *NAMESPACE 6119935c2908f54ee56b42be6dc3a667 *NEWS 406e1e232e2738575d0042fcc8af6663 *R/RcppExports.R ac3002973d28f7d5c3fd9311dc1883c3 *R/dbcal.R 2bf8a8f14a103896a1e508d87fe73ad8 *R/read.R b855a3b100628fde9eb4adb9f8817c26 *R/readstata13.R 424ae7f8220cebe36b7056536260862d *R/save.R 94eb038c59d994a2b265ed3bf4a9f99b *R/tools.R 7384c1cbd8ec7033396619b21cfe7cb2 *README.md a885e4f610350825892c92d3ca858889 *inst/extdata/encode.do 1165031bfee6c9e6ce501baa24e3a7f1 *inst/extdata/encode.dta 23c478f4b7d45b7aabcc48a0f5795480 *inst/extdata/encodecp.dta b9463f13d2e57b2d0ee028368eefcd29 *inst/extdata/gen_fac.do 1530f9cdf1f80c39158ea8d249e19af0 *inst/extdata/gen_fac.dta d6127dcadbd1316ee9dafd18420f01b1 *inst/extdata/missings.do dcd880aca64cc264c0ba20ee9b8d1510 *inst/extdata/missings.dta d66c8a83373c17ab2098ca07b975a97e *inst/extdata/missings_lsf.dta 36d795506440d058f7506aa0a7b70989 *inst/extdata/missings_msf.dta 8204563fbdff2e7ee74951eb894c6154 *inst/extdata/nonint.do ed8842275b4ba33858fe0822ff3f178e *inst/extdata/nonint.dta 295396a1a55b4326d89d2c2a86e90441 *inst/extdata/sp500.stbcal 389e33d907d10ec8efe41250f99221ab *inst/extdata/statacar.do f899f302225e099f83de7ac42f0623f2 *inst/extdata/statacar.dta a4248360860c7223c04f2bda517994fd *inst/extdata/test.zip 1e29776eed16f780a9beee2d11ada4d4 *inst/extdata/underscore.do 18d63a094394dd93f3b4363fcd09f322 *inst/extdata/underscore.dta 21ee394a9e4bb76bad9db76f235fe484 *inst/include/read_dta.h ee59a72937fa065afa154d196391cc6b *inst/include/read_pre13_dta.h e6c512611ed69dfafe91c3d917a216f7 *inst/include/readstata.h efbb4bc304a72b9177d1fe80bd312cad *inst/include/statadefines.h fc806a4ead84a5b3c6bb4f00af91ebf3 *inst/include/swap_endian.h 3e936e81cffb62a119785e96d210b1e9 *man/as.caldays.Rd ad5c66b56a915af320cfd148f6821c0a *man/get.label.Rd 6cd661d4be412f74faf922824d53e963 *man/get.label.name.Rd d5168142f1ab6dca7bd4c49eb7d86b73 *man/get.lang.Rd 3e1150ec98a4819b08daaa669b06d48d *man/get.origin.codes.Rd f3c2ac88ad9ea19659f1d7c35f3d0ac9 *man/maxchar.Rd 867b52582947d1ab21feaa9ee46109fc *man/read.dta13.Rd bf5e59df39b6f50b5fadbba97858e9e8 *man/readstata13.Rd 4477a84b56c50fc6a6292945aac8d56c *man/save.dta13.Rd bc27b06c1c1e566f8c3bcb49eadd61b6 *man/saveToExport.Rd a3313bcd94aecbb7e790728deb1042b9 *man/set.label.Rd 8b6b2575244f59c0e49fe29d9bfc9bb2 *man/set.lang.Rd ec29e8c38f6333f0f2ce706a95acff83 *man/stbcal.Rd 7d51e553b01ca610cd927f1abbb07cd9 *man/varlabel.Rd 5a37728c526310cfca2804ea6c29fb51 *src/Makevars 5a37728c526310cfca2804ea6c29fb51 *src/Makevars.win 95d90ed905996a8775364b2dd8d773d1 *src/RcppExports.cpp cf4edf15e68ffd8ed2dae8b6c53e1abb *src/rcpp_pre13_savestata.cpp 37acd24c73e8494add7bdfb47f4c8f7f *src/rcpp_readstata.cpp 96dc7840bc3d90fc9f49fa9962460554 *src/rcpp_savestata.cpp aaf1f8f5c9a0231a6c89064a34606805 *src/read_dta.cpp 4808de52efbdbfc5e189377eaeae299a *src/read_pre13_dta.cpp 6981790aad70b87c588ba8ff5c84bf82 *src/register.c 4dd91c288ce11a342d68442481e65e8b *tests/testthat.R 8b479881afa4359599b788f54820da51 *tests/testthat/test_read.R 63e1049b8b51af74684b8965d348e345 *tests/testthat/test_save.R readstata13/DESCRIPTION0000644000176200001440000000220113103134150014056 0ustar liggesusersPackage: readstata13 Type: Package Title: Import 'Stata' Data Files Version: 0.9.0 Authors@R: c( person("Jan Marvin", "Garbuszus", email = "jan.garbuszus@ruhr-uni-bochum.de", role = c("aut")), person("Sebastian", "Jeworutzki", email="Sebastian.Jeworutzki@ruhr-uni-bochum.de", role = c("aut", "cre")), person("R Core Team", role="cph"), person("Magnus Thor", "Torfason", role="ctb"), person("Luke M.", "Olson", role="ctb"), person("Giovanni", "Righi", role="ctb") ) Description: Function to read and write the 'Stata' file format. URL: https://github.com/sjewo/readstata13 BugReports: https://github.com/sjewo/readstata13/issues License: GPL-2 | file LICENSE Imports: Rcpp (>= 0.11.5) LinkingTo: Rcpp ByteCompile: yes Suggests: testthat RoxygenNote: 6.0.1 NeedsCompilation: yes Packaged: 2017-05-05 10:31:16 UTC; sj Author: Jan Marvin Garbuszus [aut], Sebastian Jeworutzki [aut, cre], R Core Team [cph], Magnus Thor Torfason [ctb], Luke M. Olson [ctb], Giovanni Righi [ctb] Maintainer: Sebastian Jeworutzki Repository: CRAN Date/Publication: 2017-05-05 17:37:44 UTC readstata13/man/0000755000176200001440000000000013103030636013134 5ustar liggesusersreadstata13/man/readstata13.Rd0000644000176200001440000000113013077075424015550 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/readstata13.R \docType{package} \name{readstata13} \alias{readstata13} \alias{readstata13-package} \title{Import Stata Data Files} \description{ Function to read the Stata file format into a data.frame. } \note{ If you catch a bug, please do not sue us, we do not have any money. } \seealso{ \code{\link[foreign]{read.dta}} and \code{memisc} for dta files from Stata Versions < 13 } \author{ Marvin Garbuszus \email{jan.garbuszus@ruhr-uni-bochum.de} Sebastian Jeworutzki \email{sebastian.jeworutzki@ruhr-uni-bochum.de} } readstata13/man/saveToExport.Rd0000644000176200001440000000060013077075424016100 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/tools.R \name{saveToExport} \alias{saveToExport} \title{Check if numeric vector can be expressed as interger vector} \usage{ saveToExport(x) } \arguments{ \item{x}{vector of data frame} } \description{ Compression can reduce numeric vectors as integers if the vector does only contain integer type data. } readstata13/man/get.label.name.Rd0000644000176200001440000000171113077075424016215 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/tools.R \name{get.label.name} \alias{get.label.name} \title{Get Names of Stata Label Set} \usage{ get.label.name(dat, var.name = NULL, lang = NA) } \arguments{ \item{dat}{\emph{data.frame.} Data.frame created by \code{read.dta13}.} \item{var.name}{\emph{character vector.} Variable names. If \code{NULL}, get names of all label sets.} \item{lang}{\emph{character.} Label language. Default language defined by \code{\link{get.lang}} is used if NA} } \value{ Returns an named vector of variable labels } \description{ Retrieves the Stata label set in the dataset for all or an vector of variable names. } \details{ Stata stores factor labels in variable independent labels sets. This function retrieves the name of the label set for a variable. } \author{ Jan Marvin Garbuszus \email{jan.garbuszus@ruhr-uni-bochum.de} Sebastian Jeworutzki \email{sebastian.jeworutzki@ruhr-uni-bochum.de} } readstata13/man/save.dta13.Rd0000644000176200001440000000537313103030636015304 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/save.R \name{save.dta13} \alias{save.dta13} \title{Write Stata Binary Files} \usage{ save.dta13(data, file, data.label = NULL, time.stamp = TRUE, convert.factors = TRUE, convert.dates = TRUE, tz = "GMT", add.rownames = FALSE, compress = FALSE, version = 117, convert.underscore = FALSE) } \arguments{ \item{data}{\emph{data.frame.} A data.frame Object.} \item{file}{\emph{character.} Path to the dta file you want to export.} \item{data.label}{\emph{character.} Name of the dta-file.} \item{time.stamp}{\emph{logical.} If \code{TRUE}, add a time.stamp to the dta-file.} \item{convert.factors}{\emph{logical.} If \code{TRUE}, factors will be converted to Stata variables with labels. Stata expects strings to be encoded as Windows-1252, so all levels will be recoded. Character which can not be mapped in Windows-1252 will be saved as hexcode.} \item{convert.dates}{\emph{logical.} If \code{TRUE}, dates will be converted to Stata date time format. Code from \code{foreign::write.dta}} \item{tz}{\emph{character.} The name of the timezone convert.dates will use.} \item{add.rownames}{\emph{logical.} If \code{TRUE}, a new variable rownames will be added to the dta-file.} \item{compress}{\emph{logical.} If \code{TRUE}, the resulting dta-file will use all of Statas numeric-vartypes.} \item{version}{\emph{numeric.} Stata format for the resulting dta-file either the internal Stata dta-format (e.g. 117 for Stata 13) or versions 6 - 14.} \item{convert.underscore}{\emph{logical.} If \code{TRUE}, all non numerics or non alphabet characters will be converted to underscores.} } \value{ The function writes a dta-file to disk. The following features of the dta file format are supported: \describe{ \item{datalabel:}{Dataset label} \item{time.stamp:}{Timestamp of file creation} \item{formats:}{Stata display formats. May be used with \code{\link[base]{sprintf}}} \item{type:}{Stata data type (see Stata Corp 2014)} \item{var.labels:}{Variable labels} \item{version:}{dta file format version} \item{strl:}{List of character vectors for the new strL string variable type. The first element is the identifier and the second element the string.} } } \description{ \code{save.dta13} writes a Stata dta-file bytewise and saves the data into a dta-file. } \references{ Stata Corp (2014): Description of .dta file format \url{http://www.stata.com/help.cgi?dta} } \seealso{ \code{\link[foreign]{read.dta}} in package \code{foreign} and \code{memisc} for dta files from Stata versions < 13 and \code{read_dta} in package \code{haven} for Stata version >= 13. } \author{ Jan Marvin Garbuszus \email{jan.garbuszus@ruhr-uni-bochum.de} Sebastian Jeworutzki \email{sebastian.jeworutzki@ruhr-uni-bochum.de} } readstata13/man/as.caldays.Rd0000644000176200001440000000200713077075424015462 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/dbcal.R \name{as.caldays} \alias{as.caldays} \title{Convert Stata business calendar dates in readable dates.} \usage{ as.caldays(buisdays, cal, format = "\%Y-\%m-\%d") } \arguments{ \item{buisdays}{numeric Vector of business dates} \item{cal}{data.frame Conversion table for business calendar dates} \item{format}{character String with date format as in \code{\link{as.Date}}} } \value{ Returns a vector of readable dates. } \description{ Convert Stata business calendar dates in readable dates. } \examples{ # read business calendar and data sp500 <- stbcal(system.file("extdata/sp500.stbcal", package="readstata13")) dat <- read.dta13(system.file("extdata/statacar.dta", package="readstata13")) # convert dates and check dat$ldatescal2 <- as.caldays(dat$ldate, sp500) all(dat$ldatescal2==dat$ldatescal) } \author{ Jan Marvin Garbuszus \email{jan.garbuszus@ruhr-uni-bochum.de} Sebastian Jeworutzki \email{sebastian.jeworutzki@ruhr-uni-bochum.de} } readstata13/man/maxchar.Rd0000644000176200001440000000117313103030636015050 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/tools.R \name{maxchar} \alias{maxchar} \title{Check max char length of data.frame vectors} \usage{ maxchar(x) } \arguments{ \item{x}{vector of data frame} } \description{ Stata requires us to provide the maximum size of a charactervector as every row is stored in a bit region of this size. } \details{ Ex: If the max chars size is four, _ is no character in this vector: 1. row: four 3. row: one_ 4. row: ____ If a character vector contains only missings or is empty, we will assign it a value of one, since Stata otherwise cannot handle what we write. } readstata13/man/varlabel.Rd0000644000176200001440000000160113077075424015227 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/tools.R \name{varlabel} \alias{varlabel} \alias{varlabel} \alias{varlabel<-} \title{Get and assign Stata Variable Labels} \usage{ varlabel(dat, var.name = NULL, lang = NA) varlabel(dat) <- value } \arguments{ \item{dat}{\emph{data.frame.} Data.frame created by \code{read.dta13}.} \item{var.name}{\emph{character vector.} Variable names. If NULL, get label for all variables.} \item{lang}{\emph{character.} Label language. Default language defined by \code{\link{get.lang}} is used if NA} \item{value}{\emph{character vector.} Vector of variable names.} } \value{ Returns an named vector of variable labels } \description{ Retrieve or set variable labels for a dataset. } \author{ Jan Marvin Garbuszus \email{jan.garbuszus@ruhr-uni-bochum.de} Sebastian Jeworutzki \email{sebastian.jeworutzki@ruhr-uni-bochum.de} } readstata13/man/get.origin.codes.Rd0000644000176200001440000000205613077075424016605 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/tools.R \name{get.origin.codes} \alias{get.origin.codes} \title{Get Origin Code Numbers for Factors} \usage{ get.origin.codes(x, label.table) } \arguments{ \item{x}{\emph{factor.} Factor to obtain code for} \item{label.table}{\emph{table.} Table with factor levels obtained by \code{\link{get.label}}.} } \value{ Returns an integer with original codes } \description{ Recreates the code numbers of a factor as stored in the Stata dataset. } \details{ While converting numeric variables into factors, the original code numbers are lost. This function reconstructs the codes from the attribute \code{label.table}. } \examples{ dat <- read.dta13(system.file("extdata/statacar.dta", package="readstata13")) labname <- get.label.name(dat,"type") labtab <- get.label(dat, labname) # comparsion get.origin.codes(dat$type, labtab) as.integer(dat$type) } \author{ Jan Marvin Garbuszus \email{jan.garbuszus@ruhr-uni-bochum.de} Sebastian Jeworutzki \email{sebastian.jeworutzki@ruhr-uni-bochum.de} } readstata13/man/get.lang.Rd0000644000176200001440000000166113077075424015144 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/tools.R \name{get.lang} \alias{get.lang} \title{Show Default Label Language} \usage{ get.lang(dat, print = T) } \arguments{ \item{dat}{\emph{data.frame.} Data.frame created by \code{read.dta13}.} \item{print}{\emph{logical.} If \code{TRUE}, print available languages and default language.} } \value{ Returns a list with two components: \describe{ \item{languages:}{Vector of label languages used in the dataset} \item{default:}{Name of the actual default label language, otherwise NA} } } \description{ Displays informations about the defined label languages. } \details{ Stata allows to define multiple label sets in different languages. This functions reports the available languages and the selected default language. } \author{ Jan Marvin Garbuszus \email{jan.garbuszus@ruhr-uni-bochum.de} Sebastian Jeworutzki \email{sebastian.jeworutzki@ruhr-uni-bochum.de} } readstata13/man/get.label.Rd0000644000176200001440000000167713077075424015311 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/tools.R \name{get.label} \alias{get.label} \title{Get Stata Label Table for a Label Set} \usage{ get.label(dat, label.name) } \arguments{ \item{dat}{\emph{data.frame.} Data.frame created by \code{read.dta13}.} \item{label.name}{\emph{character.} Name of the Stata label set} } \value{ Returns a named vector of code numbers } \description{ Retrieve the value labels for a specific Stata label set. } \details{ This function returns the table of factor levels which represent a Stata label set. The name of a label set for a variable can be obtained by \code{\link{get.label.name}}. } \examples{ dat <- read.dta13(system.file("extdata/statacar.dta", package="readstata13")) labname <- get.label.name(dat,"type") get.label(dat, labname) } \author{ Jan Marvin Garbuszus \email{jan.garbuszus@ruhr-uni-bochum.de} Sebastian Jeworutzki \email{sebastian.jeworutzki@ruhr-uni-bochum.de} } readstata13/man/set.label.Rd0000644000176200001440000000163313103036763015306 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/tools.R \name{set.label} \alias{set.label} \title{Assign Stata Labels to a Variable} \usage{ set.label(dat, var.name, lang = NA) } \arguments{ \item{dat}{\emph{data.frame.} Data.frame created by \code{read.dta13}.} \item{var.name}{\emph{character.} Name of the variable in the data.frame} \item{lang}{\emph{character.} Label language. Default language defined by \code{\link{get.lang}} is used if NA} } \value{ Returns a labeled factor } \description{ Assign value labels from a Stata label set to a variable. If duplicated labels are found, unique labels will be generated according the following scheme: "label_(integer code)". } \examples{ dat <- read.dta13(system.file("extdata/statacar.dta", package="readstata13"), convert.factors=FALSE) # compare vectors set.label(dat, "type") dat$type # German label set.label(dat, "type", "de") } readstata13/man/stbcal.Rd0000644000176200001440000000274413077075424014720 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/dbcal.R \name{stbcal} \alias{stbcal} \title{Parse Stata business calendar files} \usage{ stbcal(stbcalfile) } \arguments{ \item{stbcalfile}{\emph{stbcal-file} Stata buisness calendar file created by Stata.} } \value{ Returns a data.frame with two cols: \describe{ \item{range:}{The date matching the buisnesdate. Date format.} \item{buisdays:}{The Stata business calendar day. Integer format.} } } \description{ Create conversion table for business calendar dates. } \details{ Stata 12 introduced business calender format. Business dates are integer numbers in a certain range of days, weeks, months or years. In this range some days are omitted (e.g. weekends or holidays). If a business calendar was created, a stbcal file matching this calendar was created. This file is required to read the business calendar. This parser reads the stbcal- file and returns a data.frame with dates matching business calendar dates. A dta-file containing Stata business dates imported with read.stata13() shows in formats which stdcal file is required (e.g. "%tbsp500" requires sp500.stbcal). Stata allows adding a short description called purpose. This is added as an attribute of the resulting data.frame. } \examples{ sp500 <- stbcal(system.file("extdata/sp500.stbcal", package="readstata13")) } \author{ Jan Marvin Garbuszus \email{jan.garbuszus@ruhr-uni-bochum.de} Sebastian Jeworutzki \email{sebastian.jeworutzki@ruhr-uni-bochum.de} } readstata13/man/set.lang.Rd0000644000176200001440000000173113077075424015156 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/tools.R \name{set.lang} \alias{set.lang} \title{Assign Stata Language Labels} \usage{ set.lang(dat, lang = NA, generate.factors = FALSE) } \arguments{ \item{dat}{\emph{data.frame.} Data.frame created by \code{read.dta13}.} \item{lang}{\emph{character.} Label language. Default language defined by \code{\link{get.lang}} is used if NA} \item{generate.factors}{\emph{logical.} If \code{TRUE}, missing factor levels are generated.} } \value{ Returns a data.frame with value labels in language "lang". } \description{ Changes default label language for a dataset. } \examples{ dat <- read.dta13(system.file("extdata/statacar.dta", package="readstata13")) get.lang(dat) varlabel(dat) # set German label datDE <- set.lang(dat, "de") get.lang(datDE) varlabel(datDE) } \author{ Jan Marvin Garbuszus \email{jan.garbuszus@ruhr-uni-bochum.de} Sebastian Jeworutzki \email{sebastian.jeworutzki@ruhr-uni-bochum.de} } readstata13/man/read.dta13.Rd0000644000176200001440000001260013103036420015245 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/read.R \name{read.dta13} \alias{read.dta13} \title{Read Stata Binary Files} \usage{ read.dta13(file, convert.factors = TRUE, generate.factors = FALSE, encoding = "UTF-8", fromEncoding = NULL, convert.underscore = FALSE, missing.type = FALSE, convert.dates = TRUE, replace.strl = TRUE, add.rownames = FALSE, nonint.factors = FALSE, select.rows = NULL) } \arguments{ \item{file}{\emph{character.} Path to the dta file you want to import.} \item{convert.factors}{\emph{logical.} If \code{TRUE}, factors from Stata value labels are created.} \item{generate.factors}{\emph{logical.} If \code{TRUE} and convert.factors is TRUE, missing factor labels are created from integers. If duplicated labels are found, unique labels will be generated according the following scheme: "label_(integer code)".} \item{encoding}{\emph{character.} Strings can be converted from Windows-1252 or UTF-8 to system encoding. Options are "latin1" or "UTF-8" to specify target encoding explicitly. Stata 14 files are UTF-8 encoded and may contain strings which can't be displayed in the current locale. Set encoding=NULL to stop reencoding.} \item{fromEncoding}{\emph{character.} We expect strings to be encoded as "CP1252" for Stata Versions 13 and older. For dta files saved with Stata 14 or newer "UTF-8" is used. In some situation the used encoding can differ for Stata 14 files and must be manually set.} \item{convert.underscore}{\emph{logical.} If \code{TRUE}, "_" in variable names will be changed to "."} \item{missing.type}{\emph{logical.} Stata knows 27 different missing types: ., .a, .b, ..., .z. If \code{TRUE}, attribute \code{missing} will be created.} \item{convert.dates}{\emph{logical.} If \code{TRUE}, Stata dates are converted.} \item{replace.strl}{\emph{logical.} If \code{TRUE}, replace the reference to a strL string in the data.frame with the actual value. The strl attribute will be removed from the data.frame (see details).} \item{add.rownames}{\emph{logical.} If \code{TRUE}, the first column will be used as rownames. Variable will be dropped afterwards.} \item{nonint.factors}{\emph{logical.} If \code{TRUE}, factors labels will be assigned to variables of type float and double.} \item{select.rows}{\emph{integer.} Vector of one or two numbers. If single value rows from 1:val are selected. If two values of a range are selected the rows in range will be selected.} } \value{ The function returns a data.frame with attributes. The attributes include \describe{ \item{datalabel:}{Dataset label} \item{time.stamp:}{Timestamp of file creation} \item{formats:}{Stata display formats. May be used with \code{\link{sprintf}}} \item{types:}{Stata data type (see Stata Corp 2014)} \item{val.labels:}{For each variable the name of the associated value labels in "label"} \item{var.labels:}{Variable labels} \item{version:}{dta file format version} \item{label.table:}{List of value labels.} \item{strl:}{Character vector with long strings for the new strl string variable type. The name of every element is the identifier.} \item{expansion.fields:}{list providing variable name, characteristic name and the contents of Stata characteristic field.} \item{missing:}{List of numeric vectors with Stata missing type for each variable.} } } \description{ \code{read.dta13} reads a Stata dta-file and imports the data into a data.frame. } \details{ If the filename is a url, the file will be downloaded as a temporary file and read afterwards. Stata files are encoded in ansinew. Depending on your system's default encoding certain characters may appear wrong. Using a correct encoding may fix these. Variable names stored in the dta-file will be used in the resulting data.frame. Stata types char, byte, and int will become integer; float and double will become numerics. R only knows a single missing type, while Stata knows 27, so all Stata missings will become NA in R. If you need to keep track of Statas original missing types, you may use \code{missing.type=TRUE}. Stata dates are converted to R's Date class the same way foreign handles dates. Stata 13 introduced a new character type called strL. strLs are able to store strings up to 2 billion characters. While R is able to store strings of this size in a character vector, the printed representation of such vectors looks rather cluttered, so it's possible to save only a reference in the data.frame with option \code{replace.strl=FALSE}. In R, you may use rownames to store characters (see for instance \code{data(swiss)}). In Stata, this is not possible and rownames have to be stored as a variable. If you want to use rownames, set add.rownames to TRUE. Then the first variable of the dta-file will hold the rownames of the resulting data.frame. Reading dta-files of older and newer versions than 13 was introduced with version 0.8. } \note{ read.dta13 uses GPL 2 licensed code by Thomas Lumley and R-core members from foreign::read.dta(). } \references{ Stata Corp (2014): Description of .dta file format \url{http://www.stata.com/help.cgi?dta} } \seealso{ \code{\link[foreign]{read.dta}} in package \code{foreign} and \code{memisc} for dta files from Stata versions < 13 and \code{read_dta} in package \code{haven} for Stata version >= 13. } \author{ Jan Marvin Garbuszus \email{jan.garbuszus@ruhr-uni-bochum.de} Sebastian Jeworutzki \email{sebastian.jeworutzki@ruhr-uni-bochum.de} } readstata13/LICENSE0000644000176200001440000004315213077075424013411 0ustar liggesusersGNU GENERAL PUBLIC LICENSE Version 2, June 1991 Copyright (C) 1989, 1991 Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA Everyone is permitted to copy and distribute verbatim copies of this license document, but changing it is not allowed. Preamble The licenses for most software are designed to take away your freedom to share and change it. By contrast, the GNU General Public License is intended to guarantee your freedom to share and change free software--to make sure the software is free for all its users. This General Public License applies to most of the Free Software Foundation's software and to any other program whose authors commit to using it. (Some other Free Software Foundation software is covered by the GNU Lesser General Public License instead.) You can apply it to your programs, too. When we speak of free software, we are referring to freedom, not price. Our General Public Licenses are designed to make sure that you have the freedom to distribute copies of free software (and charge for this service if you wish), that you receive source code or can get it if you want it, that you can change the software or use pieces of it in new free programs; and that you know you can do these things. To protect your rights, we need to make restrictions that forbid anyone to deny you these rights or to ask you to surrender the rights. These restrictions translate to certain responsibilities for you if you distribute copies of the software, or if you modify it. For example, if you distribute copies of such a program, whether gratis or for a fee, you must give the recipients all the rights that you have. You must make sure that they, too, receive or can get the source code. And you must show them these terms so they know their rights. We protect your rights with two steps: (1) copyright the software, and (2) offer you this license which gives you legal permission to copy, distribute and/or modify the software. Also, for each author's protection and ours, we want to make certain that everyone understands that there is no warranty for this free software. If the software is modified by someone else and passed on, we want its recipients to know that what they have is not the original, so that any problems introduced by others will not reflect on the original authors' reputations. Finally, any free program is threatened constantly by software patents. We wish to avoid the danger that redistributors of a free program will individually obtain patent licenses, in effect making the program proprietary. To prevent this, we have made it clear that any patent must be licensed for everyone's free use or not licensed at all. The precise terms and conditions for copying, distribution and modification follow. GNU GENERAL PUBLIC LICENSE TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION 0. This License applies to any program or other work which contains a notice placed by the copyright holder saying it may be distributed under the terms of this General Public License. The "Program", below, refers to any such program or work, and a "work based on the Program" means either the Program or any derivative work under copyright law: that is to say, a work containing the Program or a portion of it, either verbatim or with modifications and/or translated into another language. (Hereinafter, translation is included without limitation in the term "modification".) Each licensee is addressed as "you". Activities other than copying, distribution and modification are not covered by this License; they are outside its scope. The act of running the Program is not restricted, and the output from the Program is covered only if its contents constitute a work based on the Program (independent of having been made by running the Program). Whether that is true depends on what the Program does. 1. You may copy and distribute verbatim copies of the Program's source code as you receive it, in any medium, provided that you conspicuously and appropriately publish on each copy an appropriate copyright notice and disclaimer of warranty; keep intact all the notices that refer to this License and to the absence of any warranty; and give any other recipients of the Program a copy of this License along with the Program. You may charge a fee for the physical act of transferring a copy, and you may at your option offer warranty protection in exchange for a fee. 2. You may modify your copy or copies of the Program or any portion of it, thus forming a work based on the Program, and copy and distribute such modifications or work under the terms of Section 1 above, provided that you also meet all of these conditions: a) You must cause the modified files to carry prominent notices stating that you changed the files and the date of any change. b) You must cause any work that you distribute or publish, that in whole or in part contains or is derived from the Program or any part thereof, to be licensed as a whole at no charge to all third parties under the terms of this License. c) If the modified program normally reads commands interactively when run, you must cause it, when started running for such interactive use in the most ordinary way, to print or display an announcement including an appropriate copyright notice and a notice that there is no warranty (or else, saying that you provide a warranty) and that users may redistribute the program under these conditions, and telling the user how to view a copy of this License. (Exception: if the Program itself is interactive but does not normally print such an announcement, your work based on the Program is not required to print an announcement.) These requirements apply to the modified work as a whole. If identifiable sections of that work are not derived from the Program, and can be reasonably considered independent and separate works in themselves, then this License, and its terms, do not apply to those sections when you distribute them as separate works. But when you distribute the same sections as part of a whole which is a work based on the Program, the distribution of the whole must be on the terms of this License, whose permissions for other licensees extend to the entire whole, and thus to each and every part regardless of who wrote it. Thus, it is not the intent of this section to claim rights or contest your rights to work written entirely by you; rather, the intent is to exercise the right to control the distribution of derivative or collective works based on the Program. In addition, mere aggregation of another work not based on the Program with the Program (or with a work based on the Program) on a volume of a storage or distribution medium does not bring the other work under the scope of this License. 3. You may copy and distribute the Program (or a work based on it, under Section 2) in object code or executable form under the terms of Sections 1 and 2 above provided that you also do one of the following: a) Accompany it with the complete corresponding machine-readable source code, which must be distributed under the terms of Sections 1 and 2 above on a medium customarily used for software interchange; or, b) Accompany it with a written offer, valid for at least three years, to give any third party, for a charge no more than your cost of physically performing source distribution, a complete machine-readable copy of the corresponding source code, to be distributed under the terms of Sections 1 and 2 above on a medium customarily used for software interchange; or, c) Accompany it with the information you received as to the offer to distribute corresponding source code. (This alternative is allowed only for noncommercial distribution and only if you received the program in object code or executable form with such an offer, in accord with Subsection b above.) The source code for a work means the preferred form of the work for making modifications to it. For an executable work, complete source code means all the source code for all modules it contains, plus any associated interface definition files, plus the scripts used to control compilation and installation of the executable. However, as a special exception, the source code distributed need not include anything that is normally distributed (in either source or binary form) with the major components (compiler, kernel, and so on) of the operating system on which the executable runs, unless that component itself accompanies the executable. If distribution of executable or object code is made by offering access to copy from a designated place, then offering equivalent access to copy the source code from the same place counts as distribution of the source code, even though third parties are not compelled to copy the source along with the object code. 4. You may not copy, modify, sublicense, or distribute the Program except as expressly provided under this License. Any attempt otherwise to copy, modify, sublicense or distribute the Program is void, and will automatically terminate your rights under this License. However, parties who have received copies, or rights, from you under this License will not have their licenses terminated so long as such parties remain in full compliance. 5. You are not required to accept this License, since you have not signed it. However, nothing else grants you permission to modify or distribute the Program or its derivative works. These actions are prohibited by law if you do not accept this License. Therefore, by modifying or distributing the Program (or any work based on the Program), you indicate your acceptance of this License to do so, and all its terms and conditions for copying, distributing or modifying the Program or works based on it. 6. Each time you redistribute the Program (or any work based on the Program), the recipient automatically receives a license from the original licensor to copy, distribute or modify the Program subject to these terms and conditions. You may not impose any further restrictions on the recipients' exercise of the rights granted herein. You are not responsible for enforcing compliance by third parties to this License. 7. If, as a consequence of a court judgment or allegation of patent infringement or for any other reason (not limited to patent issues), conditions are imposed on you (whether by court order, agreement or otherwise) that contradict the conditions of this License, they do not excuse you from the conditions of this License. If you cannot distribute so as to satisfy simultaneously your obligations under this License and any other pertinent obligations, then as a consequence you may not distribute the Program at all. For example, if a patent license would not permit royalty-free redistribution of the Program by all those who receive copies directly or indirectly through you, then the only way you could satisfy both it and this License would be to refrain entirely from distribution of the Program. If any portion of this section is held invalid or unenforceable under any particular circumstance, the balance of the section is intended to apply and the section as a whole is intended to apply in other circumstances. It is not the purpose of this section to induce you to infringe any patents or other property right claims or to contest validity of any such claims; this section has the sole purpose of protecting the integrity of the free software distribution system, which is implemented by public license practices. Many people have made generous contributions to the wide range of software distributed through that system in reliance on consistent application of that system; it is up to the author/donor to decide if he or she is willing to distribute software through any other system and a licensee cannot impose that choice. This section is intended to make thoroughly clear what is believed to be a consequence of the rest of this License. 8. If the distribution and/or use of the Program is restricted in certain countries either by patents or by copyrighted interfaces, the original copyright holder who places the Program under this License may add an explicit geographical distribution limitation excluding those countries, so that distribution is permitted only in or among countries not thus excluded. In such case, this License incorporates the limitation as if written in the body of this License. 9. The Free Software Foundation may publish revised and/or new versions of the General Public License from time to time. Such new versions will be similar in spirit to the present version, but may differ in detail to address new problems or concerns. Each version is given a distinguishing version number. If the Program specifies a version number of this License which applies to it and "any later version", you have the option of following the terms and conditions either of that version or of any later version published by the Free Software Foundation. If the Program does not specify a version number of this License, you may choose any version ever published by the Free Software Foundation. 10. If you wish to incorporate parts of the Program into other free programs whose distribution conditions are different, write to the author to ask for permission. For software which is copyrighted by the Free Software Foundation, write to the Free Software Foundation; we sometimes make exceptions for this. Our decision will be guided by the two goals of preserving the free status of all derivatives of our free software and of promoting the sharing and reuse of software generally. NO WARRANTY 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION. 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. END OF TERMS AND CONDITIONS How to Apply These Terms to Your New Programs If you develop a new program, and you want it to be of the greatest possible use to the public, the best way to achieve this is to make it free software which everyone can redistribute and change under these terms. To do so, attach the following notices to the program. It is safest to attach them to the start of each source file to most effectively convey the exclusion of warranty; and each file should have at least the "copyright" line and a pointer to where the full notice is found. {description} Copyright (C) {year} {fullname} This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. Also add information on how to contact you by electronic and paper mail. If the program is interactive, make it output a short notice like this when it starts in an interactive mode: Gnomovision version 69, Copyright (C) year name of author Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. This is free software, and you are welcome to redistribute it under certain conditions; type `show c' for details. The hypothetical commands `show w' and `show c' should show the appropriate parts of the General Public License. Of course, the commands you use may be called something other than `show w' and `show c'; they could even be mouse-clicks or menu items--whatever suits your program. You should also get your employer (if you work as a programmer) or your school, if any, to sign a "copyright disclaimer" for the program, if necessary. Here is a sample; alter the names: Yoyodyne, Inc., hereby disclaims all copyright interest in the program `Gnomovision' (which makes passes at compilers) written by James Hacker. {signature of Ty Coon}, 1 April 1989 Ty Coon, President of Vice This General Public License does not permit incorporating your program into proprietary programs. If your program is a subroutine library, you may consider it more useful to permit linking proprietary applications with the library. If this is what you want to do, use the GNU Lesser General Public License instead of this License.