'GAWK script- Skip rows $6 OR $7 if = "" and STD calculation

I have put togheter in an awk script, the average and standard deviation from 2 columns (weight,height) and the % of olympic medals achieved (sum and group by male/female).

I only lack the std calculation as of right now.

Sometimes either the weight or the height fields have empty values. I would like to skip them if one of them is empty, to not affect the calculations.

$6=height,$7=weight

Header should be:

Country,Sex,Weight_avg,Weight_std,Height_avg,Height_std,% Medals

Imput data to test it:

id,name,nationality,sex,date_of_birth,height,weight,sport,gold,silver,bronze,info
736041664,A Jesus Garcia,ESP,male,1969-10-17,,64,athletics,1,0,0,
435962603,Aaron Brown,USA,male,1992-05-27,1.98,79,athletics,0,1,2,
266237702,Aaron Russell,USA,male,1993-06-04,2.05,98,volleyball,0,0,1,
87689776,Aauri Lorena Bokesa,ESP,female,1988-12-14,1.80,62,athletics,0,1,0,
997877719,Ababel Yeshaneh,ETH,female,1991-07-22,1.65,54,athletics,1,0,0,
343694681,Abadi Hadis,ETH,male,1997-11-06,1.70,63,athletics,0,4,0,
376068084,Abbey D'Agostino,USA,female,1992-05-25,1.61,49,athletics,0,0,1,
162792594,Abbey Weitzeil,USA,female,1996-12-03,1.78,68,aquatics,1,1,0,
803161695,Abdelaziz Merzougui,ESP,male,1991-08-30,1.75,,athletics,1,0,1,

The script is :

BEGIN { FS="," }
NR>1  { medals_all+= ($9 + $10 + $11)          # sum of ALL medals
        if ($3 != country) next                # if not the country of interest then go to next record
        found_country=1
        counts[$4]++                           # count of athletes by sex
        height_sum[$4]+= $6                    # sum of heights by sex
        weight_sum[$4]+= $7                    # sum of weights by sex
        medals_sum[$4]+= ($9 + $10 + $11)      # sum of medals by sex
      }
END   { if (found_country != 1) {
           printf "Sorry, country \"%s\" not found.\n", country
        }
        else {
           print "Country,Sex,Weight_avg,Weight_std,Height_avg,Height_std,% Medals"
           for (sex in counts)
               printf "%s,%s,%.4f Kg,%s,%.3f m,%s,%.4f%\n",
                      country,sex,
                      (counts[sex]>0) ? (weight_sum[sex]/counts[sex])    : 0,"weight_std",
                      (counts[sex]>0) ? (height_sum[sex]/counts[sex])    : 0,"height_std",
                      (medals_all >0) ? (medals_sum[sex]/medals_all*100) : 0
        }
      }

I was thinking of something like:

if ($6 | $7 = "" ) next

But it gives me an error and I don't know where to put it (After END block or before?)



Solution 1:[1]

One awk idea (sans the code for std deviation)

$ cat athletes.awk
BEGIN { FS="," }
NR>1  { medals_all+= ($9 + $10 + $11)          # sum of ALL medals
        if ($3 != country) next                # if not the country of interest then go to next record
        found_country=1
        counts[$4]++                           # count of athletes by sex
        height_sum[$4]+= $6                    # sum of heights by sex
        weight_sum[$4]+= $7                    # sum of weights by sex
        medals_sum[$4]+= ($9 + $10 + $11)      # sum of medals by sex
      }
END   { if (found_country != 1) {
           printf "Sorry, country \"%s\" not found.\n", country
        }
        else {
           print "Country,Sex,Weight_avg,Weight_std,Height_avg,Height_std,% Medals"
           for (sex in counts)
               printf "%s,%s,%.4f Kg,%s,%.3f m,%s,%.4f%\n",
                      country,sex,
                      (counts[sex]>0) ? (weight_sum[sex]/counts[sex])    : 0,"weight_std",
                      (counts[sex]>0) ? (height_sum[sex]/counts[sex])    : 0,"height_std",
                      (medals_all >0) ? (medals_sum[sex]/medals_all*100) : 0
        }
      }

Testing the script:

$ awk -v country=USA -f athletes.awk athletesv2.csv

Country,Sex,Weight_avg,Weight_std,Height_avg,Height_std,% Medals
USA,female,58.5000 Kg,weight_std,1.695 m,height_std,18.7500%
USA,male,88.5000 Kg,weight_std,2.015 m,height_std,25.0000%

$ awk -v country=ESP -f athletes.awk athletesv2.csv

Country,Sex,Weight_avg,Weight_std,Height_avg,Height_std,% Medals
ESP,female,62.0000 Kg,weight_std,1.800 m,height_std,6.2500%
ESP,male,65.5000 Kg,weight_std,1.735 m,height_std,18.7500%

$ awk -v country=ETH -f athletes.awk athletesv2.csv

Country,Sex,Weight_avg,Weight_std,Height_avg,Height_std,% Medals
ETH,male,63.0000 Kg,weight_std,1.700 m,height_std,25.0000%
ETH,female,54.0000 Kg,weight_std,1.650 m,height_std,6.2500%

$ awk -v country=XXX -f athletes.awk athletesv2.csv

Sorry, country "XXX" not found.

Solution 2:[2]

This doesn't attempt to do the std deviation calculation and idk how you're getting those medal percent numbers in your expected output but it should be easy for you to tweak this to finish off whatever it is you need to do:

$ cat tst.awk
BEGIN {
    FS = OFS = ","
    OFMT = "%.4f"
}
NR==1 {
    for (i=1; i<=NF; i++) {
        f[$i] = i
    }
    print "Country", "Sex", "Weight_avg", "Weight_std", "Height_avg", "Height_std", "% Medals"
    next
}
$(f["nationality"]) == country_code {
    sex = $(f["sex"])
    ccWeight[sex] += $(f["weight"])
    ccHeight[sex] += $(f["height"])
    ccMedals[sex] += ( $(f["gold"]) + $(f["silver"]) + $(f["bronze"]) )
}
END {
    for ( sex in ccWeight ) {
        avgWeight = ccWeight[sex] / ccMedals[sex]
        stdWeight = "foo"
        avgHeight = ccWeight[sex] / ccMedals[sex]
        stdHeight = "bar"
        pctMedals = ( ccMedals[sex] / (NR - 1) ) * 100

        print country_code, sex, avgWeight, stdWeight, avgHeight, stdHeight, pctMedals
    }
}

$ awk -v country_code=USA -f tst.awk athletesv2.csv
Country,Sex,Weight_avg,Weight_std,Height_avg,Height_std,% Medals
USA,female,39,foo,39,bar,33.3333
USA,male,44.2500,foo,44.2500,bar,44.4444

$ awk -v country_code=ESP -f tst.awk athletesv2.csv
Country,Sex,Weight_avg,Weight_std,Height_avg,Height_std,% Medals
ESP,female,62,foo,62,bar,11.1111
ESP,male,43.6667,foo,43.6667,bar,33.3333

$ awk -v country_code=ETH -f tst.awk athletesv2.csv
Country,Sex,Weight_avg,Weight_std,Height_avg,Height_std,% Medals
ETH,male,15.7500,foo,15.7500,bar,44.4444
ETH,female,54,foo,54,bar,11.1111

Here's another possible interpretation of how to calculate average medals given your comment below but it still doesn't produce the output you want so I guess you mean something different again:

$ cat tst.awk
BEGIN {
    FS = OFS = ","
    OFMT = "%.4f"
}
NR==1 {
    for (i=1; i<=NF; i++) {
        f[$i] = i
    }
    print "Country", "Sex", "Weight_avg", "Weight_std", "Height_avg", "Height_std", "% Medals"
    next
}
{
    sex = $(f["sex"])
    numMedals = ( $(f["gold"]) + $(f["silver"]) + $(f["bronze"]) )
    allMedals[sex] += numMedals
}
$(f["nationality"]) == country_code {
    ccWeight[sex] += $(f["weight"])
    ccHeight[sex] += $(f["height"])
    ccMedals[sex] += numMedals
}
END {
    for ( sex in ccWeight ) {
        avgWeight = ccWeight[sex] / ccMedals[sex]
        stdWeight = "foo"
        avgHeight = ccWeight[sex] / ccMedals[sex]
        stdHeight = "bar"
        pctMedals = ( ccMedals[sex] / allMedals[sex] ) * 100

        print country_code, sex, avgWeight, stdWeight, avgHeight, stdHeight, pctMedals
    }
}

$ awk -v country_code=USA -f tst.awk athletesv2.csv
Country,Sex,Weight_avg,Weight_std,Height_avg,Height_std,% Medals
USA,female,39,foo,39,bar,60
USA,male,44.2500,foo,44.2500,bar,36.3636

Solution 3:[3]

A single unified solution that no longer requires the user to manually enter nations one at a time :

  • the PROCINFO[ ] bit is for swapping from mawk to gawk while retaining some semblance of sorted order

==

< test_athletesv2.csv\
\
| WHINY_USERS=1 mawk '
  function sum(_,__,___) { 
      if(+__<-__) { 
           return sprintf(OFMT,$_) 
      }
      ___=""
      for(_;_<=__;_+=3) {
         ___+=$_ }
      return +___ 
   } 
   function mean(_,__) {
       return \
          sprintf(OFMT,
          (+__<-__ ? +$_ :2/__*sum(_,__))\
                          /(100.0^(_==1)))
   }
   function sd(_,__,___,____,_____) {
       if(+__<-__) {
           return "0.0000000"
       }
       ____=""
       _____=100^(_==1)
       for(_;_<=+__;_+=3) {
           ____+=(($_)/_____-___)^2
       }
       return (____/(__/(_=2)))^_--^-_ 
  }
  function printreport(_,__,___) {
      ___=""
      print substr(_,__~__,index(_,"=")-1),
            substr(_,      index(_,"=")+(_=1)),
            ___=mean(_,__),sd(_++,__,___),
            ___=mean(_,__),sd(_++,__,___),
            sprintf("%8.4f-%%",sum(_,__)*100/_______)  
 } 
 BEGIN {  _ = ""
 PROCINFO[    "sorted_in"  \
          ] = "@ind_str_asc";
        ___ = 3
     ______ = " Country,Gender,Weight_avg,Weight_std"\
              ",Height_avg,Height_std,%-Medals"
     SUBSEP =   "="
        OFS = FS = ","
       
 getline } { sub("$",sprintf("=%.f=%.f=%.f", \
                             int(100*$6),$7,-_\
                             +(_+=+$9+$10+$11)),
                      _____[____[$___]=$___,$4]) 
 } END {
      _______ = +_
          ___ =  3
           FS = SUBSEP
      CONVFMT = OFMT ="%13.7f"
 
      for(_ in ____) {
          printf("%s%s%s",ORS,______,ORS)
          for(__ in _____) {
              if(index(__,_)) {
                  $+FS=substr(_____[__],—-___)
                  printreport(__,(-!!___)^(NF==++___)*NF) 
              } 
          }
      }
 }' | column -s',' -t | column -t |   lgp3 3 

Country  Gender  Weight_avg  Weight_std  Height_avg  Height_std  %-Medals
ESP      female  1.8000000   0.0000000   62.0000000  0.0000000   6.2500-%
ESP      male    1.1566667   0.4723660   43.6666667  17.8688639  18.7500-%

Country  Gender  Weight_avg  Weight_std  Height_avg  Height_std  %-Medals
ETH      female  1.6500000   0.0000000   54.0000000  0.0000000   6.2500-%
ETH      male    1.7000000   0.0000000   63.0000000  0.0000000   25.0000-%

Country  Gender  Weight_avg  Weight_std  Height_avg  Height_std  %-Medals
USA      female  1.1300000   0.4665119   39.0000000  17.7106371  18.7500-%
USA      male    1.3400000   0.5476008   59.0000000  25.3048085  25.0000-%

Sources

This article follows the attribution requirements of Stack Overflow and is licensed under CC BY-SA 3.0.

Source: Stack Overflow

Solution Source
Solution 1
Solution 2
Solution 3