#!/usr/bin/env python
"""Streaming mapper: turn raw county lines from STDIN into '^'-joined records.

Each input line is classified by its shape and, if usable, written to STDOUT
as ``cid^county^state2digit^population``.  Fields that a given line does not
supply are filled with the placeholder ``"z"``.

Two line shapes are recognised (file names are taken from the original
comments; the files themselves are not part of this script):

* more than five commas  -> treated as a DataSet.txt row; the first two
  comma-separated fields become ``cid`` and ``population``.  Rows whose
  first field matches the pattern ``..000`` are dropped (the original
  comment describes these as the country and state lines).
* anything else          -> treated as a FIPS_CountyName.txt row; ", " is
  collapsed to ",", the first space becomes "," and the line is split on
  ",", giving ``cid``, ``county`` and ``state2digit``.

Lines that fit neither shape are skipped silently, as before.
"""
# Provenance: this script was captured from a terminal session
#   hduser@benjamin-VirtualBox:~/data$ ls
#   DataDict.txt FIPS_CountyName.txt smplReducer1-Original.py
#   smplReducer2Original.py smplReducer.py DataSet.txt smplMapper.py
#   smplReducer1.py smplReducer2.py
#   hduser@benjamin-VirtualBox:~/data$ cat smplMapper.py
# NOTE(review): presumably run under Hadoop Streaming together with the
# smplReducer*.py scripts listed above -- confirm.

import re
import sys

# Filler for fields a record does not provide.
# NOTE(review): the original comment said "default sorted as first", but "z"
# compares greater than digits and upper-case letters -- confirm the intended
# sort order against the reducer.
PLACEHOLDER = "z"

# A line with more than this many commas is handled as a DataSet.txt row.
DATASET_MIN_COMMAS = 5

# Matches ids whose 3rd-5th characters are "000"; such DataSet rows are dropped.
# Compiled once instead of being looked up for every input line.
AGGREGATE_ID = re.compile(r"..000")

OUTPUT_FORMAT = "%s^%s^%s^%s"


def map_line(line):
    """Return the output record for one input line, or None to skip it.

    line: one raw text line; surrounding whitespace is stripped first.

    Returns the string ``cid^county^state2digit^population`` or ``None``
    when the line is filtered out or too short to be used.
    """
    line = line.strip()

    if line.count(",") > DATASET_MIN_COMMAS:
        # More than five commas guarantees at least two leading fields.
        cid, population = line.split(",", 2)[:2]
        if AGGREGATE_ID.match(cid):
            # Explicit skip; the original reached the same outcome through
            # a swallowed IndexError.
            return None
        return OUTPUT_FORMAT % (cid, PLACEHOLDER, PLACEHOLDER, population)

    # Collapse ", " to ",", then turn only the FIRST space into a comma so
    # that spaces inside the remaining text are kept.
    parts = line.replace(", ", ",").replace(" ", ",", 1).split(",")

    if len(parts) == 2:
        # NOTE(review): a non-DataSet line that yields exactly two fields is
        # emitted as cid/population.  Kept for output compatibility; confirm
        # whether such lines should be dropped instead.
        return OUTPUT_FORMAT % (parts[0], PLACEHOLDER, PLACEHOLDER, parts[1])

    if len(parts) < 3:
        # Blank or unsplittable line: nothing usable.
        return None

    # Any fields beyond the third are ignored, as in the original.
    return OUTPUT_FORMAT % (parts[0], parts[1], parts[2], PLACEHOLDER)


def map_lines(lines):
    """Yield the output record for every usable line in *lines*, in order."""
    for raw in lines:
        record = map_line(raw)
        if record is not None:
            yield record


def main():
    """Read lines from STDIN and print one record per usable line."""
    for record in map_lines(sys.stdin):
        # Single parenthesised argument: behaves the same on Python 2 and 3.
        print(record)


if __name__ == "__main__":
    main()