# Machine Learning/kdd sample
#
# From the Noisebridge wiki.
#
# get a random subsample of students from the training set

use strict;
use warnings;

use Getopt::Long;
use File::Basename;

# --- Command-line handling ---------------------------------------------------
# Defaults; each one can be overridden by the option of the same name.
my $numItems=1000;     # how many ids to put in the sample
my $method="random";   # "first" takes ids in order; any other value samples at random
my $type="students";   # kind of id to sample
my $help="";
my $exitCode=0;        # stays 0 for a plain help request, 1 for a usage error

# numitems is declared '=i' so that Getopt::Long rejects a non-integer value
# instead of letting it reach the numeric comparisons further down.
my $optionsOk=GetOptions ('numitems=i' => \$numItems,
			'method=s' => \$method,
			'type=s' => \$type,
			'h' => \$help);
if (not($optionsOk)) {
	# Getopt::Long has already reported what was wrong on STDERR;
	# follow it with the usage text rather than running with bad options.
	$help=1;
	$exitCode=1;
}
if ($numItems < 1) {
	print STDERR "numitems must be at least 1 (got $numItems)\n";
	$help=1;
	$exitCode=1;
}

# The only positional argument is the file to sample; without it show the help.
my $inputFile=shift(@ARGV);
if (not($inputFile)) {
	$help=1;
}

my $progname=basename($0);

if ($help) {
	print <<"END_OF_HELP";
This program will sample a tab-separated txt file of students.
It can be used to get all examples per student (for a number of students).

Basic usage:
$progname <input file>

Full usage:
$progname [-numitems <number of items>] [-method <'random'|'first'>] [-type <'students'>] <input file>

Examples:
$progname algebra_2008_2009_train.txt
  by default, will create a sample of 1000 random students (all examples on those students)
$progname -numitems 20000 algebra_2008_2009_train.txt
  create a sample of 20000 random students
$progname -type students -method first algebra_2008_2009_train.txt
  create a sample of the first 1000 students
END_OF_HELP
	exit($exitCode);
}

# Point the user at the usage text before any work starts.
print "Type '$progname -h' to get the help\n";

# Use a "download" directory when one exists here, otherwise the current directory.
my $directory=(-e "download") ? "download" : ".";

# The sample file name is the input file name followed by the sampling parameters.
my $outputFile=join("_", $inputFile, "sample", $numItems, $method, $type) . ".csv";
print "Getting $numItems $method $type, putting in $outputFile\n";

# get the list of possible ids
#
# The ids are cached in $directory/studentinfo.csv, one per line. When the
# cache is missing it is built by scanning the input file once; otherwise the
# cache is read back. Either way @sourceIds ends up holding each id once, in
# the order it first appears, so that "-method first" means the same thing on
# the first run as on later (cached) runs.
#
# NOTE(review): the cache name does not depend on the input file, so a cache
# built from one input is reused for any other input - confirm that is wanted.
my $sourceIdFile="";
my $idIndex=1;        # zero-based column holding the id in each tab-separated line
my %names=();         # not referenced anywhere else in the visible script
my %sourceIds=();     # ids already seen while scanning the input file
my @sourceIds=();     # candidate ids to sample from
if ($type eq "students") {
	$sourceIdFile="$directory/studentinfo.csv";
	if (not (-e $sourceIdFile)) {
		open(my $input, '<', $inputFile)
			or die "Cannot read '$inputFile': $!\n";
		open(my $cache, '>', $sourceIdFile)
			or die "Cannot write '$sourceIdFile': $!\n";
		while(defined(my $line = <$input>)) {
			chomp($line);
			# The header row names the column; it is not an id to sample.
			next if ($. == 1 and $line =~ /Student Id/);
			my @values=split("\t",$line);
			my $id = $values[$idIndex];
			# Blank or truncated lines have no usable id.
			next if (not(defined($id)) or $id eq "");
			next if ($sourceIds{$id});
			$sourceIds{$id} = 1;
			push @sourceIds, $id;
			print {$cache} "$id\n";
		}
		# Buffered write errors only show up at close time.
		close($cache) or die "Cannot finish writing '$sourceIdFile': $!\n";
		close($input);
	} else {
		open(my $cache, '<', $sourceIdFile)
			or die "Cannot read '$sourceIdFile': $!\n";
		while (defined(my $line=<$cache>)) {
			chomp($line);
			# A cache written before the header was skipped may still contain
			# the column label or an empty id; neither is a real id.
			next if ($line eq "" or $line =~ /Student Id/);
			push @sourceIds, $line;
		}
		close($cache);
	}
}

# get the list of ids to pull
#
# Moves ids out of @sourceIds into %idsWanted until enough have been taken:
# from the front of the list for "-method first", from a random position for
# any other method. Taken ids are removed from @sourceIds, so none is picked twice.
my %idsWanted=();
my $numFound=0;

# Never ask for more ids than exist: pulling from an emptied list yields undef,
# which would be stored as an empty-string id and match lines with no id at all.
my $numAvailable=scalar(@sourceIds);
my $numToPick=$numItems;
if ($numToPick > $numAvailable) {
	print STDERR "Only $numAvailable $type available ($numItems requested), taking all of them\n";
	$numToPick=$numAvailable;
}

while ($numFound < $numToPick) {
	my $id;
	if ($method eq "first") {
		$id=shift(@sourceIds);
	} else {
		my $index=int(rand(scalar(@sourceIds)));
		# splice removes the chosen id from the source array and returns it
		$id=splice(@sourceIds,$index,1);
	}
	$idsWanted{$id}=1;
	$numFound++;
}
print "Pulling $type ids (found " . scalar(keys %idsWanted) . ":\n";
#my @sortedIds=sort(keys(%idsWanted));
print "This could take a while...\n";

# Copy the sampled lines of one tab-separated file into a new file.
#
# Arguments:
#   $inPath    - file to read
#   $outPath   - file to create (overwritten if it already exists)
#   $wanted    - hash ref whose keys are the ids to keep
#   $idColumn  - zero-based index of the id column
#
# The first line is copied unchanged when it contains "Student Id" (a header);
# otherwise it is reported and then filtered like any other line. Every later
# line is copied only when its id is a key of %$wanted. Progress is printed
# every 100000 lines as the share of the file's bytes read so far, so the
# figure is meaningful for an input of any size.
#
# Dies if either file cannot be opened or the output cannot be written out.
sub copyWantedLines {
	my ($inPath, $outPath, $wanted, $idColumn) = @_;

	open(my $in, '<', $inPath) or die "Cannot read '$inPath': $!\n";
	open(my $out, '>', $outPath) or die "Cannot write '$outPath': $!\n";
	my $totalBytes=-s $in;

	# check first line for header (an empty file has no first line at all)
	my $firstLine=<$in>;
	if (defined($firstLine)) {
		chomp($firstLine);
		if ($firstLine =~ /Student Id/) {
			print {$out} "$firstLine\n";
		} else {
			print "Er...no header...\n$firstLine\n";
			my @values=split(/\t/,$firstLine);
			my $id=$values[$idColumn];
			if (defined($id) and $wanted->{$id}) {
				print {$out} "$firstLine\n";
			}
		}
	}

	# now go through the rest of the lines
	my $lineNum=1;
	while (defined(my $line=<$in>)) {
		chomp($line);
		my @values=split(/\t/,$line);
		# Blank or truncated lines have no id column; they are never wanted.
		my $id=$values[$idColumn];
		if (defined($id) and $wanted->{$id}) {
			print {$out} "$line\n";
		}
		if ($lineNum % 100000 == 0) {
			my $percent=$totalBytes ? 100 * tell($in)/$totalBytes : 0;
			print "...line $lineNum ($percent %): " . (defined($id) ? $id : "") . "\n";
		}
		$lineNum++;
	}

	# Buffered write errors only show up at close time.
	close($out) or die "Cannot finish writing '$outPath': $!\n";
	close($in);
	return;
}

# go through the list and pull those lines
copyWantedLines($inputFile, $outputFile, \%idsWanted, $idIndex);


# Do the same for the test file, whose name is the input name with the first
# "train" replaced by "test".
my $test_input_file = $inputFile;
$test_input_file =~ s/train/test/;
my $output_test_file = "${test_input_file}_sample_${numItems}_${method}_${type}.csv";
if ($test_input_file eq $inputFile) {
	# Nothing was replaced: sampling again would only rewrite $outputFile.
	print "No 'train' in '$inputFile', so there is no matching test file to sample\n";
} elsif (not(-e $test_input_file)) {
	print "Test file '$test_input_file' not found, skipping it\n";
} else {
	copyWantedLines($test_input_file, $output_test_file, \%idsWanted, $idIndex);
}

exit(0);